TNC - Thai National Corpus II - คลังข้อมุลภาษาไทยแห่งชาติ ๒ ในพระราชูปถัมภ์สมเด็จพระเทพรัตนราชสุดาฯ สยามบรมราชกุมารี
//print_r($_POST);
require_once 'inc/init.php';
$startTime_xxx=getMicrotime();
//init genre order
$tmp=query("select * from cl where cl_type='CL_G' and cl_cd<>'' order by cl_sub_cd");
for ($i=0;$i input parameter <--
// output : raw, mil
// sortby : freq, perc, alph
// p : word(s)
// w2 : context
// wl : left context
// wr : right context
// freq1 : support frequency
//init sum stat
$tmp=query("select sum(wrd_cnt) as sum_wrd_cnt from wrd_stat");
$sum_wrd_cnt['ALL']=$tmp[0]['sum_wrd_cnt'];
if (request('output')=='mil'){ //show stat in MIL
$tmp=query("select cl_genre, sum(wrd_cnt) as sum_wrd_cnt from wrd_stat group by cl_genre;");
for ($i=0;$i$val){
if ($data[$wrd_id]['TOT']>=$freq1){
if ($_POST['sortby']=="alph")
$aux[$wrd_id]=$data[$wrd_id]['wrd_txt'];
elseif ($_POST['sortby']=="perc")
$aux[$wrd_id]=$data[$wrd_id]['TOT'];
}
}
//step C : sort
if ($_POST['sortby']=="alph")
asort($aux);
elseif ($_POST['sortby']=="perc")
arsort($aux);
//print_x($aux);exit;
//step D : sum grand total + filter out LIMIT
$i=0;
foreach ($aux as $wrd_id=>$val){
if (++$i<=$limit){
foreach ($data[$wrd_id] as $a=>$b){
$data['GRAND'][$a]+=$data[$wrd_id][$a];
}
}else{
unset ($aux[$wrd_id]);
}
}
}
//*******************
//case of context
else{
// Context (collocation) branch: resolve $p to exactly one word id, then score
// every word found in precache_ctx at the requested distances from it
// (co-occurrence ratio, mutual information, Dunning log-likelihood), sort the
// candidates and keep the first $limit of them.
// NOTE(review): $p is interpolated directly into the SQL text. Its origin is
// not visible in this part of the file; if it comes from request input this is
// an SQL-injection vector - confirm it is escaped upstream.
$tmp=query("select wrd_id from wrd where wrd_txt like '$p'");
// Flag set for code outside this block; its consumer is not visible here.
$context_flg=true;
// The two error cases below only print a message; execution continues, and the
// scoring section is skipped because count($tmp) is not 1.
if (count($tmp)>1){ // more than one word matched the pattern: rejected
echo "result > 1, error";
}
if (count($tmp)==0){ // no word matched the pattern
echo "data not found";
}
if (count($tmp)==1) { // exactly one match: the only case that is scored
$wrd_id=$tmp[0]['wrd_id'];
// Total number of word occurrences in wrd_stat under the session filters.
// The SQL fragments in $_SESSION['filter_sql_1'] / ['filter_sql_2'] are built
// elsewhere and appended verbatim.
$sql="select sum(wrd_cnt) as count from wrd_stat b
where true ".$_SESSION['filter_sql_1']." ".$_SESSION['filter_sql_2'];
$tmp=query1($sql);
// The tiny constant presumably keeps later divisions/logs away from an exact
// zero (also turns a NULL sum into a number) - TODO confirm intent.
$total_occurrence=$tmp['count']+0.000001;
// Disabled alternative: number of distinct words instead of occurrences.
//$sql="select count(distinct(wrd_id)) as count from wrd_stat b
//where true ".$_SESSION['filter_sql_1']." ".$_SESSION['filter_sql_2'];
//$tmp=query1($sql);
//$total_word=$tmp['count']+0.000001;
// Occurrence count of the main word ($p) under the same filters.
$sql="select sum(wrd_cnt) as count from wrd a left join wrd_stat b using (wrd_id)
where wrd_txt like '$p' ".$_SESSION['filter_sql_1']." ".$_SESSION['filter_sql_2'];
$tmp=query1($sql);
$main_count=$tmp['count']+0.000001;
// Collocation candidates: rows of precache_ctx for the main word, joined to wrd
// to get the candidate's text, restricted to candidates matching $w2.
// NOTE(review): "in(-$wl,$wr)" matches only the two exact distances -$wl and
// $wr, not every distance between them - confirm this is what precache_ctx
// expects. $wl, $wr and $w2 are interpolated unescaped, like $p above.
$sql="
select b.ctx_id, a.wrd_txt wrd_txt, b.cl_genre, b.ctx_freq total
from precache_ctx b, wrd a
where b.wrd_id=$wrd_id
and b.wrd_dis in(-$wl,$wr) and b.ctx_id=a.wrd_id ".
$_SESSION['filter_sql_1']." ".$_SESSION['filter_sql_2']."
and a.wrd_txt like '$w2'";
//echo $sql," ";exit;
// NOTE(review): this uses the legacy mysql_* extension directly instead of the
// query()/query1() helpers used elsewhere; that extension was removed in PHP 7.
// The die() text says "Could not connect" although the failure here is a query
// failure.
$tmp = mysql_query($sql) or die("Could not connect: " . mysql_error());
//$tmp=query($sql);
//print_x($tmp);
// Step A: accumulate, per candidate word (ctx_id), its frequency per genre and
// its overall total ('TOT').
// NOTE(review): $data is not initialised in this block, so the "+=" lines rely
// on PHP creating missing keys on the fly (undefined-index notices) - confirm
// $data is empty/defined when this branch starts.
while ($tmp2=mysql_fetch_array($tmp,MYSQL_ASSOC)){
$data[$tmp2['ctx_id']]['wrd_txt']=$tmp2['wrd_txt'];
$data[$tmp2['ctx_id']][$tmp2['cl_genre']]+=$tmp2['total'];
$data[$tmp2['ctx_id']]['TOT']+=$tmp2['total'];
}
// Step B: drop candidates below the $freq1 threshold and compute the
// relevance statistics for the rest.
// Note: the loop variable reuses $wrd_id, so the main word's id assigned above
// is overwritten from here on.
foreach ($data as $wrd_id=>$val){
if ($data[$wrd_id]['TOT']>=$freq1){
// One extra query per surviving candidate: its overall occurrence count under
// the session filters.
$tmp=query1("select sum(wrd_cnt) as x from wrd_stat b where wrd_id=$wrd_id ".
$_SESSION['filter_sql_1']." ".$_SESSION['filter_sql_2']);
$data[$wrd_id]['ALL']=$tmp['x'];
// CO: share of the candidate's occurrences that fall in this context.
// NOTE(review): divides by $tmp['x'] without a zero/NULL guard.
$data[$wrd_id]['CO']=$data[$wrd_id]['TOT']/$tmp['x'];
// MI = log2( n(a and b) * total_occurrence / ( n(a) * n(b) * span ) ),
// with span = $wl + $wr.
// NOTE(review): span is 0 when both $wl and $wr are 0 - division by zero.
$data[$wrd_id]['MI']=log( $data[$wrd_id]['TOT']*$total_occurrence/($tmp['x']*$main_count*($wl+$wr)) , 2);
// Dunning's log-likelihood, computed from a 2x2 contingency table:
//   a = co-occurrences, b = candidate without main word,
//   c = main word without candidate, d = everything else.
$data[$wrd_id]['DL']=0;
$C12 = $data[$wrd_id]['TOT'];
$C1 = $tmp['x'];
$C2 = $main_count;
$a = $C12;
$b = $C1 - $C12;
$c = $C2 - $C12;
//$d = $total_word - $C1 - $C2 + $C12;
$d = $total_occurrence - $C1 - $C2 + $C12;
// ll = sum(cell*log(cell)) - sum(margin*log(margin)) + N*log(N); DL = 2*ll.
// NOTE(review): when a cell is exactly 0 (e.g. $b when every occurrence of the
// candidate is in this context) the term is 0*log(0), which appears to yield
// NAN in PHP rather than 0 - confirm and guard if so.
$ll = $a*log($a) + $b*log($b) + $c*log($c) + $d*log($d)
- ($a+$b)*log($a+$b) - ($a+$c)*log($a+$c) - ($b+$d)*log($b+$d) - ($c+$d)*log($c+$d)
+ ($a+$b+$c+$d)*log($a+$b+$c+$d);
$data[$wrd_id]['DL'] = 2*$ll;
// Disabled earlier formulation of the likelihood ratio (binomial form), kept
// for reference:
// $p = $C2/$total_word;
// $p1 = $C12/$C1;
// $p2 = ($C2-$C12)/($total_word - $C1);
// if ($p == 1) { $p = 0.9999999; }
// if ($p1 == 1) { $p1 = 0.9999999; }
// if ($p2 == 0) { $p2 = 0.0000001; }
// $likelihood = ($C12*log($p) + ($C1-$C12) * log (1-$p)) +
// ( ($C2-$C12) * log($p)+ ($Totalword-$C1-$C2+$C12)*log(1-$p) ) -
// ($C12*log($p1) + ($C1-$C12)*log(1-$p1)) -
// ( ($C2-$C12) * log($p2)+ ($Totalword-$C1-$C2+$C12)*log(1-$p2) ) ;
// $likelihood = -2 * $likelihood;
// $data[$wrd_id]['DL']=$likelihood;
// Select which statistic is reported as the relevance score ('REL'):
// mutual information when requested, Dunning's log-likelihood otherwise.
if ($_REQUEST['stat']=='Mutual Information'){
$data[$wrd_id]['REL']=$data[$wrd_id]['MI'];
}else{
$data[$wrd_id]['REL']=$data[$wrd_id]['DL'];
}
// Build the sort key for this candidate.
// NOTE(review): only 'alph' and 'perc' are handled; for any other sortby value
// (the header comment of this script also lists 'freq') $aux receives no
// entries, so step D below keeps nothing - confirm.
if ($_POST['sortby']=="alph")
$aux[$wrd_id]=$data[$wrd_id]['wrd_txt'];
elseif ($_POST['sortby']=="perc")
$aux[$wrd_id]=$data[$wrd_id]['REL'];
}else{
// Below the frequency threshold: remove the candidate entirely.
unset($data[$wrd_id]);
}
}
// Step C: sort the keys - ascending by text, or descending by relevance.
if ($_POST['sortby']=="alph")
asort($aux);
elseif ($_POST['sortby']=="perc")
arsort($aux);
// Step D: keep only the first $limit candidates in $aux and add each of their
// fields into the $data['GRAND'] totals row.
// Note: the inner loop iterates over every key of the candidate, so the
// 'wrd_txt' string is also fed to "+=" along with the numeric fields.
$i=0;
foreach ($aux as $wrd_id=>$val){
if (++$i<=$limit){
foreach ($data[$wrd_id] as $a=>$b){
$data['GRAND'][$a]+=$data[$wrd_id][$a];
}
}else{
unset ($aux[$wrd_id]);
}
}
//print_x($data);
}
//exit;
}
/**
 * Comparison callback (usort-style) that orders rows by their "relevance"
 * field, highest value first.
 *
 * @param array $a Row with a "relevance" entry.
 * @param array $b Row with a "relevance" entry.
 * @return int -1 when $a is more relevant than $b, 1 when it is less relevant,
 *             0 when both have the same relevance.
 */
function sort_perc($a, $b)
{
    // Ties must compare as equal: the previous version returned 1 for both
    // cmp(a, b) and cmp(b, a), which is an inconsistent ordering for the
    // sort functions that take a comparison callback.
    if ($a["relevance"] == $b["relevance"]) {
        return 0;
    }
    return ($b["relevance"] < $a["relevance"]) ? -1 : 1;
}
/**
 * Comparison callback that orders rows by their "word" field using the
 * current locale's collation (strcoll).
 *
 * @param array $a Row with a "word" entry.
 * @param array $b Row with a "word" entry.
 * @return int Negative, zero or positive, as returned by strcoll().
 */
function sort_alph($a, $b)
{
    $leftWord = $a["word"];
    $rightWord = $b["word"];

    return strcoll($leftWord, $rightWord);
}
?>