ICTCLAS分词系统研究(九)--对最终结果做优化调整 - 中国搜索技术门户

推荐给好友 上一篇 | 下一篇

ICTCLAS分词系统研究(九)--对最终结果做优化调整

本站欢迎转载,但任何媒体、网站或个人转载使用时请注明来源:中国搜索门户http://www.cnsousuo.com/viewnews-1590

【中国搜索门户讯】
在研究(八)中,我们已经得到了最终的分词结果,好兴奋呀。不过,还有临门一脚不能忘:需要对一些特殊情况做处理,主要是对叠词(相邻的两个字或词相同)以及个别词性进行合并处理。

比如,以“一片片的白云很好看”为例,它的最终分词结果是:

 经过优化后(尚未进行本文所述调整)的分词结果:

序号 | 分词结果
0 | 一/m 片/q 片/q 的/uj 白云/n 很/d 好看/a

很显然,“一片片”应该作为一个整体,没有拆分的必要。下面来看源代码中的调整过程:

//Adjust the result with some rules
bool CResult::Adjust(PWORD_RESULT pItem,PWORD_RESULT pItemRet)
{
    
int i=0,j=0;
    unsigned 
int nLen;
    
char sSurName[10],sSurName2[10],sGivenName[10];
    
bool bProcessed=false;//Have been processed
    while(pItem[i].sWord[0]!=0)
    
{
        nLen
=strlen(pItem[i].sWord);
        bProcessed
=false;
        
        
//Rule1: adjust person name
        if(pItem[i].nHandle==28274&&ChineseNameSplit(pItem[i].sWord,sSurName,sSurName2,sGivenName,m_uPerson.m_dict)&&strcmp(pItem[i].sWord,"叶利钦")!=0)//'nr'
        {//Divide name into surname and given name
            
            
if(sSurName[0])
            
{
                strcpy(pItemRet[j].sWord,sSurName);
                pItemRet[j
++].nHandle=28274;
            }

            
if(sSurName2[0])
            
{
                strcpy(pItemRet[j].sWord,sSurName2);
                pItemRet[j
++].nHandle=28274;
            }

            
if(sGivenName[0])
            
{
                strcpy(pItemRet[j].sWord,sGivenName);
                pItemRet[j
++].nHandle=28274;
            }

            bProcessed
=true;
        }

        
//Rule2 for overlap words ABB 一段段、一片片
        else if(pItem[i].nHandle==27904&&strlen(pItem[i+1].sWord)==2&&strcmp(pItem[i+1].sWord,pItem[i+2].sWord)==0)
        
{//(pItem[i+1].nHandle/256=='q'||pItem[i+1].nHandle/256=='a')&&
            strcpy(pItemRet[j].sWord,pItem[i].sWord);
            strcat(pItemRet[j].sWord,pItem[i
+1].sWord);
            strcat(pItemRet[j].sWord,pItem[i
+2].sWord);
            pItemRet[j].nHandle
=27904;
            j
+=1;
            i
+=2;
            bProcessed
=true;
        }

        
//Rule3 for overlap words AA
        else if(nLen==2&&strcmp(pItem[i].sWord,pItem[i+1].sWord)==0)
        
{
            strcpy(pItemRet[j].sWord,pItem[i].sWord);
            strcat(pItemRet[j].sWord,pItem[i
+1].sWord);
             
//24832=='a'*256
            pItemRet[j].nHandle=24832;//a
            if(pItem[i].nHandle/256=='v'||pItem[i+1].nHandle/256=='v')//30208='v'8256
            {
                pItemRet[j].nHandle
=30208;
            }

            
if(pItem[i].nHandle/256=='n'||pItem[i+1].nHandle/256=='n')//30208='v'8256
            {
                pItemRet[j].nHandle
='n'*256;
            }
            
            i
+=1;
            
if(strlen(pItem[i+1].sWord)==2)
            
{//AAB:洗/洗/脸、蒙蒙亮
                if((pItemRet[j].nHandle==30208&&pItem[i+1].nHandle/256=='n')||
                   (pItemRet[j].nHandle
==24832&&pItem[i+1].nHandle/256=='a')
                   )
                
{
                    strcat(pItemRet[j].sWord,pItem[i
+1].sWord);
                    i
+=1;
                }

            }

            j
+=1;
            bProcessed
=true;
        }


        
//Rule 4: AAB 洗/洗澡
        else if(nLen==2&&strncmp(pItem[i].sWord,pItem[i+1].sWord,2)==0&&strlen(pItem[i+1].sWord)==4&&(pItem[i].nHandle/256=='v'||pItem[i].nHandle==24832))//v,a
        {
            strcpy(pItemRet[j].sWord,pItem[i].sWord);
            strcat(pItemRet[j].sWord,pItem[i
+1].sWord);
             
//24832=='a'*256
            pItemRet[j].nHandle=24832;//'a'
            if(pItem[i].nHandle/256=='v'||pItem[i+1].nHandle/256=='v')//30208='v'8256
            {
                pItemRet[j].nHandle
=30208;
            }


            i
+=1;
            j
+=1;
            bProcessed
=true;
        }

        
else if(pItem[i].nHandle/256=='u'&&pItem[i].nHandle%256)//uj,ud,uv,uz,ul,ug->u
            pItem[i].nHandle='u'*256;
        
else if(nLen==2&&strncmp(pItem[i].sWord,pItem[i+1].sWord,2)==0&&strlen(pItem[i+1].sWord)==4&&strncmp(pItem[i+1].sWord+2,pItem[i+2].sWord,2)==0)
        
{//AABB 朴朴素素 枝枝叶叶
                strcpy(pItemRet[j].sWord,pItem[i].sWord);
                strcat(pItemRet[j].sWord,pItem[i
+1].sWord);
                strcat(pItemRet[j].sWord,pItem[i
+2].sWord);
                pItemRet[j].nHandle
=pItem[i+1].nHandle;
                i
+=2;
                j
+=1;
                bProcessed
=true;
        }

        
else if(pItem[i].nHandle==28275)//PostFix
        {
            
if(m_uPlace.m_dict.IsExist(pItem[i+1].sWord,4))
            
{
                strcpy(pItemRet[j].sWord,pItem[i].sWord);
                strcat(pItemRet[j].sWord,pItem[i
+1].sWord);
                pItemRet[j].nHandle
=28275;
                i
+=1;
                j
+=1;
                bProcessed
=true;
            }

            
else if(strlen(pItem[i+1].sWord)==2&&CC_Find("",pItem[i+1].sWord))
            
{
                strcpy(pItemRet[j].sWord,pItem[i].sWord);
                strcat(pItemRet[j].sWord,pItem[i
+1].sWord);
                pItemRet[j].nHandle
=28276;
                i
+=1;
                j
+=1;
                bProcessed
=true;
            }

            
else if(strlen(pItem[i+1].sWord)==2&&CC_Find("语文字杯",pItem[i+1].sWord))
            
{
                strcpy(pItemRet[j].sWord,pItem[i].sWord);
                strcat(pItemRet[j].sWord,pItem[i
+1].sWord);
                pItemRet[j].nHandle
=28282;
                i
+=1;
                j
+=1;
                bProcessed
=true;
            }

            
else if(strlen(pItem[i+1].sWord)==2&&CC_Find("",pItem[i+1].sWord))
            
{
                strcpy(pItemRet[j].sWord,pItem[i].sWord);
                strcat(pItemRet[j].sWord,pItem[i
+1].sWord);
ICTCLAS ictclas 分词
 

评分:0

我来说两句

seccode