3 changes: 1 addition & 2 deletions README.md
@@ -10,5 +10,4 @@ Have fun!</br>

Daniel Wang</br>
August 23, 2015</br>

------------------------------
------------------------------
128 changes: 89 additions & 39 deletions jieba/__init__.py
@@ -1,47 +1,50 @@
from __future__ import absolute_import, unicode_literals
from __future__ import absolute_import, unicode_literals # force-import future features to avoid compatibility issues
__version__ = '0.37'
__license__ = 'MIT'
import re
import os
import sys
import time
import logging
import marshal
import tempfile
import threading
from math import log
from hashlib import md5
import marshal # serialization and deserialization
import tempfile # temporary files
import threading # multithreading
from math import log # logarithm (base e by default)
from hashlib import md5 # MD5 digest algorithm
from ._compat import *
from . import finalseg
from . import finalseg

if os.name == 'nt':
from shutil import move as _replace_file
if os.name == 'nt': # Windows: move the file; Linux/other: rename it
from shutil import move as _replace_file
else:
_replace_file = os.rename

_get_module_path = lambda path: os.path.normpath(os.path.join(os.getcwd(),
os.path.dirname(__file__), path))
_get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path))
_get_module_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), # absolute path of a file inside the module
os.path.dirname(__file__), path))
_get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path)) # absolute path relative to the current working directory


DEFAULT_DICT = _get_module_path("dict.txt") # default dictionary

DEFAULT_DICT = _get_module_path("dict.txt")
# set up logging
log_console = logging.StreamHandler(sys.stderr)
default_logger = logging.getLogger(__name__)
default_logger.setLevel(logging.DEBUG)
default_logger.addHandler(log_console)

DICT_WRITING = {}
DICT_WRITING = {} # tracks dictionary cache files currently being written

pool = None

re_eng = re.compile('[a-zA-Z0-9]', re.U)
re_eng = re.compile('[a-zA-Z0-9]', re.U) # matches English letters and digits

# \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
# \r\n|\s : whitespace characters. Will not be handled.
# note the regex is wrapped in parentheses, i.e. a capturing group
# with capturing parentheses, the matched delimiters are also returned in the resulting list
#re.U makes the compiled pattern use Unicode character properties
# regex for accurate (default) mode
# re.U makes the compiled pattern use Unicode character properties

# regex for accurate (default) mode
re_han_default = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U)# CJK characters and other non-space characters
re_skip_default = re.compile("(\r\n|\s)", re.U) # line breaks or whitespace

@@ -50,12 +53,20 @@
re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)# anything other than letters, digits, +, # and newlines

def setLogLevel(log_level):
"""
Set the logging level of the default logger.
"""
global logger
default_logger.setLevel(log_level)

class Tokenizer(object):

"""
Tokenizer (word segmentation) class.
"""
def __init__(self, dictionary=DEFAULT_DICT):
"""
Set up the thread lock, dictionary path, word-frequency table, total word count, the user word-tag table, the initialization flag, and the temp directory and cache file.
"""
self.lock = threading.RLock()
self.dictionary = _get_abs_path(dictionary)
self.FREQ = {}
@@ -72,6 +83,9 @@ def __repr__(self):
https://github.com/fxsjy/jieba/pull/187
'''
def gen_pfdict(self, f_name):
"""
Load the dictionary file and build the prefix frequency dictionary.
"""
lfreq = {} # dict mapping each entry to its occurrence count
ltotal = 0 # total occurrence count over all entries
with open(f_name, 'rb') as f: # open the dictionary file, dict.txt
@@ -84,14 +98,20 @@ def gen_pfdict(self, f_name):
ltotal += freq
for ch in xrange(len(word)):# handle every prefix of word
wfrag = word[:ch + 1]
if wfrag not in lfreq: # if a prefix of word is not in lfreq, set its frequency to 0
if wfrag not in lfreq: # if a prefix is not yet in lfreq, set its frequency to 0, e.g. {"不": 0, "不拘": 0, "不拘一": 0, "不拘一格": freq}
lfreq[wfrag] = 0
except ValueError:
raise ValueError(
'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
return lfreq, ltotal
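
To make the prefix table concrete, here is a standalone sketch of what gen_pfdict builds, using a tiny made-up word list instead of dict.txt (the words and counts are illustrative only):

```python
# Toy reconstruction of the prefix dictionary built by gen_pfdict (not real dict.txt data).
entries = [("北京", 10), ("大学", 8), ("北京大学", 3)]

lfreq, ltotal = {}, 0
for word, freq in entries:
    lfreq[word] = freq
    ltotal += freq
    for ch in range(len(word)):      # every prefix of the word
        wfrag = word[:ch + 1]
        if wfrag not in lfreq:       # prefixes that are not themselves words get count 0
            lfreq[wfrag] = 0

# lfreq == {"北": 0, "北京": 10, "大": 0, "大学": 8, "北京大": 0, "北京大学": 3}, ltotal == 21
```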

def initialize(self, dictionary=None):
"""
Initialize the tokenizer by loading the dictionary and building the prefix table.
"""

#if a dictionary is given and equals the current one while already initialized, return; otherwise it replaces the instance dictionary and the initialized flag is reset
#if no dictionary is given, fall back to the instance's own dictionary
if dictionary:
abs_path = _get_abs_path(dictionary)
if self.dictionary == abs_path and self.initialized:
Expand All @@ -102,6 +122,7 @@ def initialize(self, dictionary=None):
else:
abs_path = self.dictionary


with self.lock:
try:
with DICT_WRITING[abs_path]:
@@ -166,59 +187,78 @@ def initialize(self, dictionary=None):
default_logger.debug("Prefix dict has been built succesfully.")

def check_initialized(self):
"""
Ensure the tokenizer has been initialized.
"""
if not self.initialized:
self.initialize()

#dynamic programming: compute the maximum-probability segmentation
#dynamic programming: compute the maximum-probability segmentation
def calc(self, sentence, DAG, route):
N = len(sentence)
route[N] = (0, 0)
# log of the probability (products of probabilities become sums of logs, which avoids underflow)
# log of the probability (products of probabilities become sums of logs, which avoids underflow)
logtotal = log(self.total)
# traverse the sentence from back to front, computing the maximum probability in reverse
for idx in xrange(N - 1, -1, -1):
# walk the DAG entries for this position
# a generator expression finds the maximum-log-probability path
# route[idx] = max([ (log probability, end position of the word) for x in DAG[idx] ])
# saved in route as idx: (maximum log probability, end position of the word)
# self.FREQ.get(sentence[idx:x+1]) is the frequency of the candidate word sentence[idx:x+1]; use 1 if it is missing or 0
# saved in route as {idx: (maximum log probability, end position of the word)}
# route[x+1][0] is the maximum log probability of the word path over [x+1, N-1],
# i.e. [x+1][0] picks the log-probability component of the tuple (log probability, word end position) stored at position x+1
route[idx] = max((log(self.FREQ.get(sentence[idx:x + 1]) or 1) -
logtotal + route[x + 1][0], x) for x in DAG[idx])
route[idx] = max(
(log(self.FREQ.get(sentence[idx:x + 1]) or 1) - logtotal + route[x + 1][0], x)
for x in DAG[idx] # x is the end index of a candidate word starting at idx
)
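
In isolation, the recurrence above reads: route[idx] is the best (log-probability, word-end) pair over all words starting at idx. A minimal sketch of the same backward dynamic program, assuming FREQ, total and the DAG are already available:

```python
from math import log

def best_route(sentence, DAG, FREQ, total):
    # Backward DP: route[idx] = best (log prob of segmenting sentence[idx:], end index of first word).
    N = len(sentence)
    route = {N: (0, 0)}                  # base case: the empty suffix has log-probability 0
    logtotal = log(total)
    for idx in range(N - 1, -1, -1):
        route[idx] = max(
            (log(FREQ.get(sentence[idx:x + 1]) or 1) - logtotal + route[x + 1][0], x)
            for x in DAG[idx]            # x: end index of a candidate word starting at idx
        )
    return route
```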

# the DAG is stored as a dict of {key: list, ...}
# each key is the start position of a character
# the input is a piece of text
def get_DAG(self, sentence):
"""
Scan the sentence with two nested loops and pick out every fragment that appears in
the prefix dictionary. For each start position k, DAG[k] lists the end positions of
all words beginning at k, so the structure forms a directed acyclic graph over the
character positions.
"""
self.check_initialized()
DAG = {}
N = len(sentence)
for k in xrange(N):
for k in xrange(N): # iterate over every position in the text; k is the first (start) index
tmplist = []
i = k
frag = sentence[k]
while i < N and frag in self.FREQ:
if self.FREQ[frag]:
tmplist.append(i)
i = k # the second (end) index starts at k
frag = sentence[k] # the fragment starts as the single character at position k
while i < N and frag in self.FREQ: # stop as soon as the fragment is not in the prefix dictionary, i.e. it cannot grow into any known word
if self.FREQ[frag]: # a nonzero frequency means the fragment is an actual word (not just a prefix), so record its end index
tmplist.append(i)
i += 1
frag = sentence[k:i + 1]
if not tmplist:
frag = sentence[k:i + 1] # extend the fragment by one character to the right
if not tmplist: # no word starts at k, so the single character stands on its own
tmplist.append(k)
DAG[k] = tmplist
DAG[k] = tmplist # key: the start index k; value: the end indices of words starting at k
return DAG
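
As a worked example of the structure get_DAG returns: suppose the prefix dictionary contains only the words "北京", "大学" and "北京大学" (a toy assumption for illustration); then for the sentence "北京大学" the method would produce:

```python
# DAG for "北京大学" under the toy dictionary above (keys and values are character indices).
dag = {
    0: [1, 3],  # from position 0: "北京" ends at index 1, "北京大学" ends at index 3
    1: [1],     # no word starts at "京", so the character stands alone
    2: [3],     # from position 2: "大学" ends at index 3
    3: [3],     # no word starts at "学", so the character stands alone
}
```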

def __cut_all(self, sentence):
"""
Full mode: yield every word that appears anywhere in the sentence.
"""
dag = self.get_DAG(sentence)
old_j = -1
for k, L in iteritems(dag):
if len(L) == 1 and k > old_j:
yield sentence[k:L[0] + 1]
old_j = L[0]
for k, L in iteritems(dag): # iterate over (start index, list of end indices) pairs
if len(L) == 1 and k > old_j: # only one candidate (a single character) not yet covered by an earlier word
yield sentence[k:L[0] + 1] # L[0] equals k here, so the lone character is yielded
old_j = L[0] # update the coverage marker
else:
for j in L:
for j in L: # several candidates: yield each word in turn
if j > k:
yield sentence[k:j + 1]
old_j = j

def __cut_DAG_NO_HMM(self, sentence):
"""
Build the DAG from the dictionary, then use dynamic programming to produce the maximum-probability segmentation (unknown words are not handled with the HMM).
"""
DAG = self.get_DAG(sentence)
route = {}
self.calc(sentence, DAG, route)
@@ -281,7 +321,8 @@ def __cut_DAG(self, sentence):
else:
for elem in buf:
yield elem
#main entry point of jieba segmentation; the result is an iterable generator

#main entry point of jieba segmentation; the result is an iterable generator
def cut(self, sentence, cut_all=False, HMM=True):
'''
The main function that segments an entire sentence that contains
@@ -327,6 +368,7 @@ def cut(self, sentence, cut_all=False, HMM=True):
yield x
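
A brief usage sketch of cut() and its list-returning variant lcut(); the exact tokens depend on the loaded dictionary, so the outputs shown are illustrative:

```python
import jieba

text = "我来到北京清华大学"
print(jieba.lcut(text))                   # accurate mode (default), e.g. ['我', '来到', '北京', '清华大学']
print(jieba.lcut(text, cut_all=True))     # full mode: every word found in the DAG is emitted
print(jieba.lcut(text, HMM=False))        # accurate mode without the HMM for unseen words
```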

# on top of the basic segmentation (HMM or no HMM), long words are cut again
# using the word frequencies
def cut_for_search(self, sentence, HMM=True):
"""
Finer segmentation for search engines.
@@ -356,6 +398,7 @@ def lcut_for_search(self, *args, **kwargs):
_lcut = lcut
_lcut_for_search = lcut_for_search

#returns a list
def _lcut_no_hmm(self, sentence):
return self.lcut(sentence, False, False)

@@ -368,6 +411,7 @@ def _lcut_for_search_no_hmm(self, sentence):
def get_abs_path_dict(self):
return _get_abs_path(self.dictionary)

#load a user-defined dictionary
def load_userdict(self, f):
'''
Load personalized dict to improve detect rate.
@@ -404,6 +448,7 @@ def load_userdict(self, f):

def add_word(self, word, freq=None, tag=None):
"""
Add a word to the dictionary; freq and tag can be omitted.
Add a word to dictionary.
freq and tag can be omitted, freq defaults to be a calculated value
that ensures the word can be cut out.
@@ -426,6 +471,7 @@ def del_word(self, word):
"""
self.add_word(word, 0)

#suggest a frequency for a word
def suggest_freq(self, segment, tune=False):
"""
Suggest word frequency to force the characters in a word to be
@@ -455,6 +501,7 @@ def suggest_freq(self, segment, tune=False):
add_word(word, freq)
return freq
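
These dictionary-tuning methods are also exposed at module level through the default Tokenizer; a typical usage sketch (the example words are only illustrative):

```python
import jieba

jieba.add_word('石墨烯')                     # ensure a new term is kept as a single token
jieba.suggest_freq(('中', '将'), tune=True)  # tune frequencies so "中将" is split into "中" / "将"
jieba.del_word('某个词')                     # effectively remove a word (its frequency is set to 0)
```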


def tokenize(self, unicode_sentence, mode="default", HMM=True):
"""
Tokenize a sentence and yields tuples of (word, start, end)
@@ -496,6 +543,7 @@ def set_dictionary(self, dictionary_path):
self.initialized = False


#
# default Tokenizer instance

dt = Tokenizer()
@@ -566,6 +614,7 @@ def _pcut_for_search(sentence, HMM=True):

def enable_parallel(processnum=None):
"""
#enable the parallel (multiprocessing) version
Change the module's `cut` and `cut_for_search` functions to the
parallel version.
Note that this only works using dt, custom Tokenizer
@@ -587,6 +636,7 @@ def enable_parallel(processnum=None):


def disable_parallel():
#disable the parallel (multiprocessing) version
global pool, dt, cut, cut_for_search
if pool:
pool.close()
1 change: 1 addition & 0 deletions jieba/analyse/__init__.py
@@ -1,3 +1,4 @@
"""default analyse"""
from __future__ import absolute_import
from .tfidf import TFIDF
from .textrank import TextRank
12 changes: 11 additions & 1 deletion jieba/analyse/textrank.py
@@ -11,17 +11,23 @@


class UndirectWeightedGraph:
"""
Undirected weighted graph.
"""
d = 0.85

def __init__(self):
self.graph = defaultdict(list)

def addEdge(self, start, end, weight):
def addEdge(self, start, end, weight):
# use a tuple (start, end, weight) instead of a Edge object
self.graph[start].append((start, end, weight))
self.graph[end].append((end, start, weight))

def rank(self):
"""

"""
ws = defaultdict(float)
outSum = defaultdict(float)

@@ -55,6 +61,10 @@ def rank(self):


class TextRank(KeywordExtractor):
"""


"""

def __init__(self):
self.tokenizer = self.postokenizer = jieba.posseg.dt
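
The collapsed rank() method above runs a weighted PageRank-style iteration over the adjacency structure built by addEdge; a self-contained sketch of that update (the damping factor matches the class's d = 0.85, while the iteration count is chosen here for illustration):

```python
from collections import defaultdict

def rank(graph, d=0.85, iterations=10):
    # graph[node] is a list of (node, neighbour, weight) tuples, mirroring addEdge above.
    ws = defaultdict(float)        # current score of each node
    outSum = defaultdict(float)    # total outgoing edge weight of each node
    init = 1.0 / (len(graph) or 1.0)
    for node, edges in graph.items():
        ws[node] = init
        outSum[node] = sum(w for _, _, w in edges)
    for _ in range(iterations):
        for node in sorted(graph):                    # fixed order keeps results reproducible
            s = sum(w / outSum[other] * ws[other]     # neighbour's score, weighted by this edge
                    for _, other, w in graph[node])   # and normalized by its total out-weight
            ws[node] = (1 - d) + d * s
    return dict(ws)
```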