-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy path25.py
More file actions
executable file
·56 lines (45 loc) · 1.95 KB
/
Copy path25.py
File metadata and controls
executable file
·56 lines (45 loc) · 1.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
from collections import OrderedDict
from collections import Counter
def cleanInput(input):
    """Normalize raw text and split it into a list of cleaned words.

    Runs of newlines become a single space, bracketed digit markers such as
    ``[12]`` (including an empty ``[]``) are removed, repeated spaces are
    collapsed, and every non-ASCII character is dropped.  Each
    space-separated token then has leading and trailing punctuation
    stripped; tokens of length 0 or 1 are discarded unless they are the
    words "a" or "i" (in either case).

    Args:
        input: The text to clean.  The name shadows the builtin, but it is
            kept so that existing keyword callers continue to work.

    Returns:
        A list of the cleaned word strings, in their original order.
    """
    # Raw strings keep the regex escapes from being read as (invalid)
    # string escapes, which newer Python versions warn about.
    text = re.sub(r'\n+', " ", input)
    text = re.sub(r'\[[0-9]*\]', "", text)
    text = re.sub(r' +', " ", text)
    # Round-trip through bytes to discard every non-ASCII character.
    text = text.encode("utf-8").decode('ascii', "ignore")

    words = []
    for item in text.split(" "):
        # str.strip with string.punctuation trims punctuation from both ends
        # of the token (punctuation inside the token is left alone).
        item = item.strip(string.punctuation)
        if len(item) > 1 or item.lower() in ('a', 'i'):
            words.append(item)
    return words
def ngarms(input, n):
    """Build the list of n-grams found in a piece of text.

    The text is first tokenized with ``cleanInput``; every window of ``n``
    consecutive words is then returned as its own list, in order of
    appearance.

    Args:
        input: The raw text to tokenize.
        n: Number of words per window.

    Returns:
        A list of word lists, one per window position.
    """
    words = cleanInput(input)
    window_count = len(words) - n + 1
    return [words[start:start + n] for start in range(window_count)]
# Download the article.  The context manager closes the HTTP response as
# soon as the body has been read instead of leaking the connection.
with urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)") as html:
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    bsObj = BeautifulSoup(html.read(), "html.parser")

content = bsObj.find('div', {'id': 'mw-content-text'})
if content is None:
    # find() returns None when the element is absent; fail with a clear
    # message rather than an AttributeError on None below.
    raise RuntimeError("could not find div#mw-content-text in the downloaded page")
content = content.get_text()

ngramsresult = ngarms(content, 2)

# Counter gives a dict-like mapping from each 2-gram to its frequency.
# Each 2-gram is a list (unhashable), so its str() form is used as the key.
# Counter(map(str, ...)) is an equivalent spelling, and .items() exposes the
# result as (key, count) pairs instead of a mapping.
ngramsresult = Counter(str(i) for i in ngramsresult)

# Sort by frequency, most common first.  The sorted pairs are stored in an
# OrderedDict so that the ordering is preserved explicitly (plain dicts did
# not guarantee element order on older Python versions).
ngramsresult = OrderedDict(sorted(ngramsresult.items(), key=lambda t: t[1], reverse=True))
print(ngramsresult)
print("2-grams count is:"+str(len(ngramsresult)))