From 4b517afd2c22ad17be261c6818accfc416b15485 Mon Sep 17 00:00:00 2001 From: liudonghua Date: Sun, 2 Dec 2018 22:02:43 +0800 Subject: [PATCH] add raw option functionality --- lib/Segment.js | 38 ++++++++++++++++++------------ lib/module/PunctuationTokenizer.js | 5 +--- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/lib/Segment.js b/lib/Segment.js index 6818fd04..7d1f31c7 100644 --- a/lib/Segment.js +++ b/lib/Segment.js @@ -228,21 +228,29 @@ Segment.prototype.doSegment = function (text, options) { options = options || {}; var ret = []; - // 将文本按照换行符分割成多段,并逐一分词 - text.replace(/\r/g, '\n').split(/(\n|\s)+/).forEach(function (section) { - var section = section.trim(); - if (section.length < 1) return; - // ====================================== - // 分词 - var sret = me.tokenizer.split(section, me.modules.tokenizer); - - // 优化 - sret = me.optimizer.doOptimize(sret, me.modules.optimizer); - - // ====================================== - // 连接分词结果 - if (sret.length > 0) ret = ret.concat(sret); - }); + // 判断是否是raw,即不去除换行,空格,保留原格式 + if (options.raw) { + var rret = me.tokenizer.split(text, me.modules.tokenizer); + rret = me.optimizer.doOptimize(rret, me.modules.optimizer); + if (rret.length > 0) ret = ret.concat(rret); + } + else { + // 将文本按照换行符分割成多段,并逐一分词 + text.replace(/\r/g, '\n').split(/(\n|\s)+/).forEach(function (section) { + var section = section.trim(); + if (section.length < 1) return; + // ====================================== + // 分词 + var sret = me.tokenizer.split(section, me.modules.tokenizer); + + // 优化 + sret = me.optimizer.doOptimize(sret, me.modules.optimizer); + + // ====================================== + // 连接分词结果 + if (sret.length > 0) ret = ret.concat(sret); + }); + } // 去除标点符号 if (options.stripPunctuation) { diff --git a/lib/module/PunctuationTokenizer.js b/lib/module/PunctuationTokenizer.js index 529654ee..b5f3cc47 100644 --- a/lib/module/PunctuationTokenizer.js +++ b/lib/module/PunctuationTokenizer.js @@ -46,10 +46,7 @@ exports.split = function (words) { if (sw.c > lastc) { ret.push({w: word.w.substr(lastc, sw.c - lastc)}); } - // 忽略空格 - if (sw.w != ' ') { - ret.push({w: sw.w, p: POSTAG.D_W}); - } + ret.push({w: sw.w, p: POSTAG.D_W}); lastc = sw.c + sw.w.length; } var lastsw = stopinfo[stopinfo.length - 1];