3 changes: 1 addition & 2 deletions README.md
@@ -10,5 +10,4 @@ Have fun!</br>

Daniel Wang</br>
August 23, 2015</br>

------------------------------
------------------------------
128 changes: 89 additions & 39 deletions jieba/__init__.py
@@ -1,47 +1,50 @@
from __future__ import absolute_import, unicode_literals
from __future__ import absolute_import, unicode_literals # force-import future features to avoid compatibility issues
__version__ = '0.37'
__license__ = 'MIT'
import re
import os
import sys
import time
import logging
import marshal
import tempfile
import threading
from math import log
from hashlib import md5
import marshal # serialization and deserialization
import tempfile # temporary files
import threading # multithreading
from math import log # logarithm (base e by default)
from hashlib import md5 # MD5 digest algorithm
from ._compat import *
from . import finalseg
from . import finalseg

if os.name == 'nt':
from shutil import move as _replace_file
if os.name == 'nt': # Windows: move the file; Linux/other: rename it
from shutil import move as _replace_file
else:
_replace_file = os.rename

_get_module_path = lambda path: os.path.normpath(os.path.join(os.getcwd(),
os.path.dirname(__file__), path))
_get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path))
_get_module_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), # absolute path of a file inside the module
os.path.dirname(__file__), path))
_get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path)) # absolute path relative to the current working directory


DEFAULT_DICT = _get_module_path("dict.txt") # default dictionary

DEFAULT_DICT = _get_module_path("dict.txt")
# set up logging
log_console = logging.StreamHandler(sys.stderr)
default_logger = logging.getLogger(__name__)
default_logger.setLevel(logging.DEBUG)
default_logger.addHandler(log_console)

DICT_WRITING = {}
DICT_WRITING = {} # tracks dictionary cache files currently being written

pool = None

re_eng = re.compile('[a-zA-Z0-9]', re.U)
re_eng = re.compile('[a-zA-Z0-9]', re.U) # matches English letters and digits

# \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
# \r\n|\s : whitespace characters. Will not be handled.
# note the regex is wrapped in parentheses, i.e. a capturing group
# with capturing parentheses, the matched delimiters are also returned in the resulting list
#re.U makes the compiled pattern use Unicode character properties
# regex for accurate (default) mode
# re.U makes the compiled pattern use Unicode character properties

# regex for accurate (default) mode
re_han_default = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U)# CJK characters and other non-space characters
re_skip_default = re.compile("(\r\n|\s)", re.U) # line breaks or whitespace

@@ -50,12 +53,20 @@
re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)# anything other than letters, digits, +, # and newlines

def setLogLevel(log_level):
"""
Set the logging level of the default logger.
"""
global logger
default_logger.setLevel(log_level)

class Tokenizer(object):

"""
Tokenizer (word segmentation) class.
"""
def __init__(self, dictionary=DEFAULT_DICT):
"""
Set up the thread lock, dictionary path, word-frequency table, total word count, the user word-tag table, the initialization flag, and the temp directory and cache file.
"""
self.lock = threading.RLock()
self.dictionary = _get_abs_path(dictionary)
self.FREQ = {}
@@ -72,6 +83,9 @@ def __repr__(self):
https://github.com/fxsjy/jieba/pull/187
'''
def gen_pfdict(self, f_name):
"""
Load the dictionary file and build the prefix frequency dictionary.
"""
lfreq = {} # dict mapping each entry to its occurrence count
ltotal = 0 # total occurrence count over all entries
with open(f_name, 'rb') as f: # open the dictionary file, dict.txt
@@ -84,14 +98,20 @@ def gen_pfdict(self, f_name):
ltotal += freq
for ch in xrange(len(word)):# handle every prefix of word
wfrag = word[:ch + 1]
if wfrag not in lfreq: # if a prefix of word is not in lfreq, set its frequency to 0
if wfrag not in lfreq: # if a prefix is not yet in lfreq, set its frequency to 0, e.g. {"不": 0, "不拘": 0, "不拘一": 0, "不拘一格": freq}
lfreq[wfrag] = 0
except ValueError:
raise ValueError(
'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
return lfreq, ltotal
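
To make the prefix table concrete, here is a standalone sketch of what gen_pfdict builds, using a tiny made-up word list instead of dict.txt (the words and counts are illustrative only):

```python
# Toy reconstruction of the prefix dictionary built by gen_pfdict (not real dict.txt data).
entries = [("北京", 10), ("大学", 8), ("北京大学", 3)]

lfreq, ltotal = {}, 0
for word, freq in entries:
    lfreq[word] = freq
    ltotal += freq
    for ch in range(len(word)):      # every prefix of the word
        wfrag = word[:ch + 1]
        if wfrag not in lfreq:       # prefixes that are not themselves words get count 0
            lfreq[wfrag] = 0

# lfreq == {"北": 0, "北京": 10, "大": 0, "大学": 8, "北京大": 0, "北京大学": 3}, ltotal == 21
```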

def initialize(self, dictionary=None):
"""
Initialize the tokenizer by loading the dictionary and building the prefix table.
"""

#if a dictionary is given and equals the current one while already initialized, return; otherwise it replaces the instance dictionary and the initialized flag is reset
#if no dictionary is given, fall back to the instance's own dictionary
if dictionary:
abs_path = _get_abs_path(dictionary)
if self.dictionary == abs_path and self.initialized:
Expand All @@ -102,6 +122,7 @@ def initialize(self, dictionary=None):
else:
abs_path = self.dictionary


with self.lock:
try:
with DICT_WRITING[abs_path]:
@@ -166,59 +187,78 @@ def initialize(self, dictionary=None):
default_logger.debug("Prefix dict has been built succesfully.")

def check_initialized(self):
"""
Ensure the tokenizer has been initialized.
"""
if not self.initialized:
self.initialize()

#dynamic programming: compute the maximum-probability segmentation
#dynamic programming: compute the maximum-probability segmentation
def calc(self, sentence, DAG, route):
N = len(sentence)
route[N] = (0, 0)
# log of the probability (products of probabilities become sums of logs, which avoids underflow)
# log of the probability (products of probabilities become sums of logs, which avoids underflow)
logtotal = log(self.total)
# traverse the sentence from back to front, computing the maximum probability in reverse
for idx in xrange(N - 1, -1, -1):
# walk the DAG entries for this position
# a generator expression finds the maximum-log-probability path
# route[idx] = max([ (log probability, end position of the word) for x in DAG[idx] ])
# saved in route as idx: (maximum log probability, end position of the word)
# self.FREQ.get(sentence[idx:x+1]) is the frequency of the candidate word sentence[idx:x+1]; use 1 if it is missing or 0
# saved in route as {idx: (maximum log probability, end position of the word)}
# route[x+1][0] is the maximum log probability of the word path over [x+1, N-1],
# i.e. [x+1][0] picks the log-probability component of the tuple (log probability, word end position) stored at position x+1
route[idx] = max((log(self.FREQ.get(sentence[idx:x + 1]) or 1) -
logtotal + route[x + 1][0], x) for x in DAG[idx])
route[idx] = max(
(log(self.FREQ.get(sentence[idx:x + 1]) or 1) - logtotal + route[x + 1][0], x)
for x in DAG[idx] # x is the end index of a candidate word starting at idx
)
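
In isolation, the recurrence above reads: route[idx] is the best (log-probability, word-end) pair over all words starting at idx. A minimal sketch of the same backward dynamic program, assuming FREQ, total and the DAG are already available:

```python
from math import log

def best_route(sentence, DAG, FREQ, total):
    # Backward DP: route[idx] = best (log prob of segmenting sentence[idx:], end index of first word).
    N = len(sentence)
    route = {N: (0, 0)}                  # base case: the empty suffix has log-probability 0
    logtotal = log(total)
    for idx in range(N - 1, -1, -1):
        route[idx] = max(
            (log(FREQ.get(sentence[idx:x + 1]) or 1) - logtotal + route[x + 1][0], x)
            for x in DAG[idx]            # x: end index of a candidate word starting at idx
        )
    return route
```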

# the DAG is stored as a dict of {key: list, ...}
# each key is the start position of a character
# the input is a piece of text
def get_DAG(self, sentence):
"""
Scan the sentence with two nested loops and pick out every fragment that appears in
the prefix dictionary. For each start position k, DAG[k] lists the end positions of
all words beginning at k, so the structure forms a directed acyclic graph over the
character positions.
"""
self.check_initialized()
DAG = {}
N = len(sentence)
for k in xrange(N):
for k in xrange(N): # iterate over every position in the text; k is the first (start) index
tmplist = []
i = k
frag = sentence[k]
while i < N and frag in self.FREQ:
if self.FREQ[frag]:
tmplist.append(i)
i = k # the second (end) index starts at k
frag = sentence[k] # the fragment starts as the single character at position k
while i < N and frag in self.FREQ: # stop as soon as the fragment is not in the prefix dictionary, i.e. it cannot grow into any known word
if self.FREQ[frag]: # a nonzero frequency means the fragment is an actual word (not just a prefix), so record its end index
tmplist.append(i)
i += 1
frag = sentence[k:i + 1]
if not tmplist:
frag = sentence[k:i + 1] # extend the fragment by one character to the right
if not tmplist: # no word starts at k, so the single character stands on its own
tmplist.append(k)
DAG[k] = tmplist
DAG[k] = tmplist # key: the start index k; value: the end indices of words starting at k
return DAG
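
As a worked example of the structure get_DAG returns: suppose the prefix dictionary contains only the words "北京", "大学" and "北京大学" (a toy assumption for illustration); then for the sentence "北京大学" the method would produce:

```python
# DAG for "北京大学" under the toy dictionary above (keys and values are character indices).
dag = {
    0: [1, 3],  # from position 0: "北京" ends at index 1, "北京大学" ends at index 3
    1: [1],     # no word starts at "京", so the character stands alone
    2: [3],     # from position 2: "大学" ends at index 3
    3: [3],     # no word starts at "学", so the character stands alone
}
```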

def __cut_all(self, sentence):
"""
Full mode: yield every word that appears anywhere in the sentence.
"""
dag = self.get_DAG(sentence)
old_j = -1
for k, L in iteritems(dag):
if len(L) == 1 and k > old_j:
yield sentence[k:L[0] + 1]
old_j = L[0]
for k, L in iteritems(dag): # iterate over (start index, list of end indices) pairs
if len(L) == 1 and k > old_j: # only one candidate (a single character) not yet covered by an earlier word
yield sentence[k:L[0] + 1] # L[0] equals k here, so the lone character is yielded
old_j = L[0] # update the coverage marker
else:
for j in L:
for j in L: # several candidates: yield each word in turn
if j > k:
yield sentence[k:j + 1]
old_j = j

def __cut_DAG_NO_HMM(self, sentence):
"""
Build the DAG from the dictionary, then use dynamic programming to produce the maximum-probability segmentation (unknown words are not handled with the HMM).
"""
DAG = self.get_DAG(sentence)
route = {}
self.calc(sentence, DAG, route)
@@ -281,7 +321,8 @@ def __cut_DAG(self, sentence):
else:
for elem in buf:
yield elem
#main entry point of jieba segmentation; the result is an iterable generator

#main entry point of jieba segmentation; the result is an iterable generator
def cut(self, sentence, cut_all=False, HMM=True):
'''
The main function that segments an entire sentence that contains
@@ -327,6 +368,7 @@ def cut(self, sentence, cut_all=False, HMM=True):
yield x
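
A brief usage sketch of cut() and its list-returning variant lcut(); the exact tokens depend on the loaded dictionary, so the outputs shown are illustrative:

```python
import jieba

text = "我来到北京清华大学"
print(jieba.lcut(text))                   # accurate mode (default), e.g. ['我', '来到', '北京', '清华大学']
print(jieba.lcut(text, cut_all=True))     # full mode: every word found in the DAG is emitted
print(jieba.lcut(text, HMM=False))        # accurate mode without the HMM for unseen words
```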

# on top of the basic segmentation (HMM or no HMM), long words are cut again
# using the word frequencies
def cut_for_search(self, sentence, HMM=True):
"""
Finer segmentation for search engines.
@@ -356,6 +398,7 @@ def lcut_for_search(self, *args, **kwargs):
_lcut = lcut
_lcut_for_search = lcut_for_search

#returns a list
def _lcut_no_hmm(self, sentence):
return self.lcut(sentence, False, False)

@@ -368,6 +411,7 @@ def _lcut_for_search_no_hmm(self, sentence):
def get_abs_path_dict(self):
return _get_abs_path(self.dictionary)

#load a user-defined dictionary
def load_userdict(self, f):
'''
Load personalized dict to improve detect rate.
@@ -404,6 +448,7 @@ def load_userdict(self, f):

def add_word(self, word, freq=None, tag=None):
"""
Add a word to the dictionary; freq and tag can be omitted.
Add a word to dictionary.
freq and tag can be omitted, freq defaults to be a calculated value
that ensures the word can be cut out.
@@ -426,6 +471,7 @@ def del_word(self, word):
"""
self.add_word(word, 0)

#suggest a frequency for a word
def suggest_freq(self, segment, tune=False):
"""
Suggest word frequency to force the characters in a word to be
@@ -455,6 +501,7 @@ def suggest_freq(self, segment, tune=False):
add_word(word, freq)
return freq
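
These dictionary-tuning methods are also exposed at module level through the default Tokenizer; a typical usage sketch (the example words are only illustrative):

```python
import jieba

jieba.add_word('石墨烯')                     # ensure a new term is kept as a single token
jieba.suggest_freq(('中', '将'), tune=True)  # tune frequencies so "中将" is split into "中" / "将"
jieba.del_word('某个词')                     # effectively remove a word (its frequency is set to 0)
```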


def tokenize(self, unicode_sentence, mode="default", HMM=True):
"""
Tokenize a sentence and yields tuples of (word, start, end)
@@ -496,6 +543,7 @@ def set_dictionary(self, dictionary_path):
self.initialized = False


#
# default Tokenizer instance

dt = Tokenizer()
@@ -566,6 +614,7 @@ def _pcut_for_search(sentence, HMM=True):

def enable_parallel(processnum=None):
"""
#enable the parallel (multiprocessing) version
Change the module's `cut` and `cut_for_search` functions to the
parallel version.
Note that this only works using dt, custom Tokenizer
@@ -587,6 +636,7 @@ def enable_parallel(processnum=None):


def disable_parallel():
#disable the parallel (multiprocessing) version
global pool, dt, cut, cut_for_search
if pool:
pool.close()
1 change: 1 addition & 0 deletions jieba/analyse/__init__.py
@@ -1,3 +1,4 @@
"""default analyse"""
from __future__ import absolute_import
from .tfidf import TFIDF
from .textrank import TextRank
12 changes: 11 additions & 1 deletion jieba/analyse/textrank.py
@@ -11,17 +11,23 @@


class UndirectWeightedGraph:
"""
Undirected weighted graph.
"""
d = 0.85

def __init__(self):
self.graph = defaultdict(list)

def addEdge(self, start, end, weight):
def addEdge(self, start, end, weight):
# use a tuple (start, end, weight) instead of a Edge object
self.graph[start].append((start, end, weight))
self.graph[end].append((end, start, weight))

def rank(self):
"""

"""
ws = defaultdict(float)
outSum = defaultdict(float)

@@ -55,6 +61,10 @@ def rank(self):


class TextRank(KeywordExtractor):
"""


"""

def __init__(self):
self.tokenizer = self.postokenizer = jieba.posseg.dt
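
The collapsed rank() method above runs a weighted PageRank-style iteration over the adjacency structure built by addEdge; a self-contained sketch of that update (the damping factor matches the class's d = 0.85, while the iteration count is chosen here for illustration):

```python
from collections import defaultdict

def rank(graph, d=0.85, iterations=10):
    # graph[node] is a list of (node, neighbour, weight) tuples, mirroring addEdge above.
    ws = defaultdict(float)        # current score of each node
    outSum = defaultdict(float)    # total outgoing edge weight of each node
    init = 1.0 / (len(graph) or 1.0)
    for node, edges in graph.items():
        ws[node] = init
        outSum[node] = sum(w for _, _, w in edges)
    for _ in range(iterations):
        for node in sorted(graph):                    # fixed order keeps results reproducible
            s = sum(w / outSum[other] * ws[other]     # neighbour's score, weighted by this edge
                    for _, other, w in graph[node])   # and normalized by its total out-weight
            ws[node] = (1 - d) + d * s
    return dict(ws)
```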