将文本所有句子分词并返回结果的函数如下:
def tokenizer(sentences):
    """Tokenize every sentence with jieba and return the token lists.

    Args:
        sentences: iterable of sentence-like values; each is coerced
            with str() and has newline characters removed before cutting.

    Returns:
        A list with one list of tokens per input sentence.
    """
    return [jieba.lcut(str(sentence).replace('\n', '')) for sentence in sentences]
Scala 版分词和词性标注(基于 Java 的 jieba-analysis 库):
package com.xiaomi.search.model.example

// Explicit converters instead of the deprecated implicit
// scala.collection.JavaConversions (deprecated since Scala 2.12).
import scala.collection.JavaConverters._

import com.huaban.analysis.jieba.JiebaSegmenter

/** Demo entry point: segments Chinese sentences with jieba's SEARCH mode
  * and prints each token as [word, startOffset, endOffset, posTag],
  * followed by the whole segment list.
  */
object PosTagging {
  def main(args: Array[String]): Unit = {
    val jiebaSegmenter = new JiebaSegmenter
    val sentences: Array[String] = Array[String]("长春市长春节致辞")
    for (sentence <- sentences) {
      // Segment once and reuse the result; the original called process()
      // a second time just to print the list's toString.
      val segList = jiebaSegmenter.process(sentence, JiebaSegmenter.SegMode.SEARCH)
      for (seg <- segList.asScala) {
        println(seg)
      }
      println(segList.toString)
    }
  }
}
输出如下:
[长春, 0, 2, ns] [市长, 2, 4, n] [春节, 4, 6, t] [致辞, 6, 8, v]