`
thecloud
  • 浏览: 882670 次
文章分类
社区版块
存档分类
最新评论

自己写Lucene分词器原理篇——Analyzer简单讲解

 
阅读更多

内容提要:以ChineseAnalyzer为例,简单讲讲lucene分析器,也就是analyzer的分析过程

一:分析器原理

语料——>tokenizer分词器分词——>过滤器过滤——>词元——>放进字典(记录词元和位置信息)

二:代码分析

1:一共有5个类,第一个是ChineseAnalyzer分析器类,还有ChineseFilter过滤器类和它的工厂类,和ChineseTokenizer类和它的工厂类


2:ChineseAnalyzer类

/**
 * Analyzer that splits text with {@link ChineseTokenizer} (one token per CJK
 * character, letter/digit runs kept together) and then drops English stop
 * words and single-letter English tokens via {@link ChineseFilter}.
 */
public final class ChineseAnalyzer extends Analyzer {

    @Override
    protected TokenStreamComponents createComponents(String fieldName,
        Reader reader) {
      // Build the chain: the tokenizer produces raw tokens, the filter
      // wraps it and consumes/filters its output.
      final Tokenizer tokenizer = new ChineseTokenizer(reader);
      final ChineseFilter filtered = new ChineseFilter(tokenizer);
      return new TokenStreamComponents(tokenizer, filtered);
    }
}

3:ChineseFilter类,对上游tokenizer切分出的词元做过滤:去掉停用词,并把长度为1的英文词元去掉

/**
 * TokenFilter that removes English stop words and single-letter English
 * tokens. Tokens whose first character is {@code OTHER_LETTER} (e.g. CJK)
 * pass through unchanged; any other token type is silently discarded.
 */
public final class ChineseFilter extends TokenFilter {


    /** English stop words removed from the stream. Chinese stop words could be added here later. */
    public static final String[] STOP_WORDS = {
    "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "such",
    "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
    };


    // Stop-word lookup table built once per filter instance.
    private CharArraySet stopTable;

    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    public ChineseFilter(TokenStream in) {
        super(in);

        // ignoreCase=false: upstream tokenizer already lowercases terms.
        stopTable = new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS), false);
    }

    @Override
    public boolean incrementToken() throws IOException {

        // Pull tokens from upstream until one survives filtering, or the
        // stream is exhausted.
        for (;;) {
            if (!input.incrementToken()) {
                return false; // upstream exhausted
            }

            final char[] term = termAtt.buffer();
            final int len = termAtt.length();

            if (stopTable.contains(term, 0, len)) {
                continue; // stop word: drop and fetch the next token
            }

            // Classify by the first character only
            // (why not key off token type here assuming ChineseTokenizer comes first?).
            final int charType = Character.getType(term[0]);
            if (charType == Character.LOWERCASE_LETTER
                || charType == Character.UPPERCASE_LETTER) {
                // English word/token must be longer than one character.
                if (len > 1) {
                    return true;
                }
            } else if (charType == Character.OTHER_LETTER) {
                // One Chinese character counts as one word: emit as-is.
                // Chinese word extraction to be added later here.
                return true;
            }
            // Any other character class: discard and keep pulling.
        }
    }

}
4:ChineseTokenizer类,是处理分词的

/**
 * Tokenizer that emits runs of letters/digits as single lowercased tokens
 * and each OTHER_LETTER character (e.g. a CJK ideograph) as its own
 * one-character token. Every other character acts as a delimiter and is
 * discarded. Runs longer than MAX_WORD_LEN are split at that length.
 */
public final class ChineseTokenizer extends Tokenizer {
    public ChineseTokenizer(Reader in) {
      super(in);
    }
    public ChineseTokenizer(AttributeSource source, Reader in) {
      super(source, in);
    }
    public ChineseTokenizer(AttributeFactory factory, Reader in) {
      super(factory, in);
    }
       
    // offset: absolute position in the input stream; bufferIndex/dataLen:
    // read cursor and fill length of ioBuffer (dataLen becomes -1 once the
    // underlying Reader is exhausted).
    private int offset = 0, bufferIndex=0, dataLen=0;
    private final static int MAX_WORD_LEN = 255;
    private final static int IO_BUFFER_SIZE = 1024;
    private final char[] buffer = new char[MAX_WORD_LEN];     // chars of the token being built
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE]; // raw read buffer


    private int length; // number of chars currently accumulated in buffer
    private int start;  // absolute start offset of the pending token
    // Finished tokens are published through these two attributes:
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); // token text
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);   // token start/end offsets
    /** Appends one character (lowercased) to the pending token buffer. */
    private final void push(char c) {

        if (length == 0) start = offset-1;            // start of token
        buffer[length++] = Character.toLowerCase(c);  // buffer it

    }
    /**
     * Publishes the pending token through termAtt/offsetAtt.
     * @return true if a token was emitted; false if the buffer was empty
     *         (nothing left to emit for this input).
     */
    private final boolean flush() {

        if (length>0) {
            //System.out.println(new String(buffer, 0,
            //length));
          termAtt.copyBuffer(buffer, 0, length);
          offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
          return true;
        }
        else
            return false;
    }

    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        // Reset per-token state: the next token starts at the current offset.
        length = 0;
        start = offset;


        while (true) {

            final char c;
            offset++;
            // Refill ioBuffer once the previous fill has been consumed;
            // input.read(...) returns -1 when the Reader is exhausted.
           if (bufferIndex >= dataLen) {
                 dataLen = input.read(ioBuffer);
                bufferIndex = 0;
            }
            // End of input: flush() emits any pending token, or returns false
            // when the buffer is empty (analysis of this input is complete).
           if (dataLen == -1){
                offset--; 
                return flush();
            }else c = ioBuffer[bufferIndex++];// next character from the read buffer
              
            switch(Character.getType(c)) {// classify the character
            case Character.DECIMAL_DIGIT_NUMBER: 
            case Character.LOWERCASE_LETTER: 
            case Character.UPPERCASE_LETTER: 
                 // Letters/digits accumulate into one token; emit immediately
                 // if the buffer hits its maximum length.
                 push(c); 
                 if (length == MAX_WORD_LEN)return flush(); 
                 break;
            // OTHER_LETTER (e.g. a CJK char): if a letter/digit run is pending,
            // emit it first and step back one position so this char is re-read
            // on the next call; otherwise the single char is the token itself.
            case Character.OTHER_LETTER:
                if (length>0) {
                    bufferIndex--;
                    offset--;
                    return flush();
                }
                push(c);
                return flush();

            default:
                // Delimiter: ends any pending token, otherwise just skipped.
                if (length>0) return flush();
                break;
            }
        }
    }
    
    @Override
    public final void end() {
      // set final offset
      final int finalOffset = correctOffset(offset);
      this.offsetAtt.setOffset(finalOffset, finalOffset);
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      offset = bufferIndex = dataLen = 0;
    }
}
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics