# Source: PythonForResearch/homework4_case_study_2_hamlet_translation.py (GitHub repository shhschilling/PythonForResearch, branch master)

# (Line-number gutter left over from the GitHub page capture removed.)
#------------------------------------------------
# Homework 4 of course "Python for Research"
# HarvardX -  PH526xEdX, EdX
# Hamlet translation
#------------------------------------------------

#Exercise 1--------------------------------------
import pandas as pd
# Gather every "Hamlet" edition into a two-column table (language, text).
# book_titles, data_filepath and read_book are defined outside this section;
# book_titles is presumably a nested language -> author -> titles mapping
# (TODO confirm against its definition).
hamlets = pd.DataFrame(columns=["language", "text"])
book_dir = "Books"  # not referenced below; the path is spelled out literally
title_num = 1  # row label for the next Hamlet found; labels start at 1
for language in book_titles:
    for author in book_titles[language]:
        for title in book_titles[language][author]:
            if title != 'Hamlet':
                continue
            # Path layout: <data_filepath>Books/<language>/<author>/<title>.txt
            inputfile = (
                data_filepath + "Books/"
                + "/".join([language, author, title])
                + ".txt"
            )
            text = read_book(inputfile)
            hamlets.loc[title_num] = language, text
            title_num += 1
#Exercise 2--------------------------------------
# Unpack the first row of `hamlets` into its two values: language, then text.
# (The same assignment is repeated right after count_words_fast is defined.)
language, text = hamlets.iloc[0]
def count_words_fast(text):
    """Count how often each word occurs in ``text``.

    The text is lower-cased, every punctuation sequence listed in ``skips``
    is deleted, and the remainder is split on whitespace.

    Parameters
    ----------
    text : str
        The text to analyse.

    Returns
    -------
    collections.Counter
        Mapping of each word to its number of occurrences.  Empty or
        whitespace-only input yields an empty counter.
    """
    import collections
    text = text.lower()
    skips = [".", ";", ",", "!", "?", "''", '""', "%", "$"]
    for char in skips:
        text = text.replace(char, "")
    # split() without an argument splits on any run of whitespace, so
    # consecutive spaces no longer produce empty-string "words" and
    # newlines/tabs separate words instead of being glued into them.
    word_counts = collections.Counter(text.split())  # type: collections.Counter
    return word_counts

# Count the words of the first Hamlet edition and tabulate them,
# one row per distinct word.
language, text = hamlets.iloc[0]
counted_text = count_words_fast(text)
data = pd.DataFrame(
    list(counted_text.items()),
    columns=["word", "count"],
)


#Exercise 3--------------------------------------

# Character count of every word in the table.
data["length"] = [len(word) for word in data["word"]]
column_word = data["word"]

def word_frequency(count):
    """Classify a word by its occurrence count.

    Returns "frequent" when ``count`` is above 10, "infrequent" when it is
    above 1 and at most 10, and "unique" for anything else.
    """
    if count > 10:
        return "frequent"
    if count > 1:
        return "infrequent"
    return "unique"


# Label each word as frequent / infrequent / unique from its count.
data["frequency"] = [word_frequency(occurrences) for occurrences in data["count"]]

#Exercise 4--------------------------------------
# Rebuild the word table for the first Hamlet edition, using pandas column
# operations for the derived columns.
language, text = hamlets.iloc[0]
counted_text = count_words_fast(text)

data = pd.DataFrame({
    "word": list(counted_text),
    "count": list(counted_text.values()),
})

data["length"] = data["word"].map(len)

# Order matters: rows with a count of exactly 1 are first marked
# "infrequent" and then overwritten with "unique".
data.loc[data["count"].gt(10), "frequency"] = "frequent"
data.loc[data["count"].le(10), "frequency"] = "infrequent"
data.loc[data["count"].eq(1), "frequency"] = "unique"


# One-column table holding the language of every Hamlet edition.
sub_data = pd.DataFrame({"language": hamlets["language"]})