python/27.py at master · yuangang123/python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#利用马尔科夫模型
#生成文字生成器
from urllib.request import urlopen
from random import randint

#计算wordList的value和
def wordListSum(wordList):
    """Return the total of all values stored in ``wordList``.

    ``wordList`` maps each follower word to the number of times it was
    observed; an empty mapping yields 0.
    """
    # Use the builtin instead of a hand-written loop whose accumulator
    # shadowed the name ``sum``.
    return sum(wordList.values())
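#Pick the next word at random, weighted by how often each word was seen.
#Example:
#{word_a:{word_b:1,word_c:2,word_d:1},
 #word_e:{word_b:5,word_d:2,...}
 #}
 #For word_a the counts sum to 4,
 #so the random number can be 1, 2, 3 or 4.
 #1 selects word_b,
 #2 or 3 selects word_c,
 #4 selects word_d,
 #so each word is chosen in proportion to its frequency.

#This weighted random selection technique is worth remembering; it may be useful elsewhere.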
def retrieveRandomWord(wordList):
    """Return a key of ``wordList`` chosen at random, weighted by its value.

    A ticket number is drawn from 1 up to the sum of all values. Keys are
    then walked in the mapping's iteration order while their values are
    accumulated; the first key whose running total reaches the ticket is
    returned.
    """
    ticket = randint(1, wordListSum(wordList))
    runningTotal = 0
    for candidate, weight in wordList.items():
        runningTotal += weight
        # Each key owns a band of ticket numbers as wide as its weight.
        if runningTotal >= ticket:
            return candidate
#创建形如如下的二维字典
#{word_a:{word_b:2,word_c:1,word_d:1},
 #word_e:{word_b:5,word_d:2,...}
 #}
def buildWordDict(text):
    """Build a word-to-follower count table from ``text``.

    Double quotes are removed, and each of ``, . ; :`` is treated as a
    separate token. The result is a two-level dictionary of the form
    ``{word: {next_word: count, ...}, ...}`` where ``count`` is how many
    times ``next_word`` appeared immediately after ``word``. A token that
    is never followed by another token does not appear as an outer key.
    """
    text = text.replace("\"", "")

    # Surround punctuation with spaces so each mark becomes its own token.
    for symbol in (',', '.', ';', ':'):
        text = text.replace(symbol, " " + symbol + " ")

    # split() without arguments breaks on any run of whitespace (spaces,
    # newlines, tabs, carriage returns) and never yields empty strings, so
    # CRLF line endings or tabs no longer stay attached to words.
    words = text.split()

    wordDict = {}
    # Walk consecutive (previous, current) token pairs.
    for previous, current in zip(words, words[1:]):
        followers = wordDict.setdefault(previous, {})
        followers[current] = followers.get(current, 0) + 1

    return wordDict

# Download the sample speech; the with-block closes the HTTP response once
# the body has been read.
with urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt") as response:
    text = str(response.read(), "utf-8")
wordDict = buildWordDict(text)

# Dump the whole transition table, one (word, followers) pair per line.
for item in wordDict.items():
    print(item)

# Generate a chain of at most `length` words, starting from "I".
length = 100
currentWord = "I"
chainWords = []
for i in range(0, length):
    chainWords.append(currentWord)
    followers = wordDict.get(currentWord)
    # A word with no recorded followers (for example one that only occurs
    # as the final token, or a start word missing from the text) ends the
    # chain early instead of raising KeyError.
    if not followers:
        break
    currentWord = retrieveRandomWord(followers)

# Every word is followed by a single space, matching the original output.
chain = "".join(word + " " for word in chainWords)

print(chain)