Tweet-generator/Generator_from_csv.py at main · corentinpla/Tweet-generator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import pandas as pd
import numpy as np
import random
import math
from numpy.random import choice

def preprocessing(df): #delete every response and tweets containing an internet link
    index_to_drop=[]
    for k in df.index:
        s=df.at[k,"Tweet"]
        if ("@" in s) or ("http" in s):
            index_to_drop.append(k)

    df=df.drop(index_to_drop)
    df=df.drop(columns="Unnamed: 0")
    return(df)

def split_tweets(df): #df : tweets DataFrame
    words=[] #contains every words from every tweet on the training set

    for tweet in df["Tweet"]:
        for x in tweet.split():
            if "https" not in x: #we delete every link to websites
                words.append(x)

    return(words)

def get_weight():
    df=pd.DataFrame(columns = ['l1+l2','next','l1+l2+next','freq'],dtype=object)
    #l1+l2 : all the words and its following on the training sample
    #follow : df.at[i,'next'] is the word following df.at[i,'l1+l2'] in the training sample
    #freq : the frequency with which "next" follow l1+l2
    return(df)


def get_columns(words,df): #df : DataFrame as returned by get_weight, words, list as returned by split_tweets
    wordsl1l2=[]

    #l1+l2 :
    for k in range(1,len(words)):
        wordsl1l2.append(words[k-1]+' '+words[k])

    wordsl1l2.append(words[0]+' '+words[-1])
    df['l1+l2']=wordsl1l2

    #next :
    df['next'] = words[2:]+[words[0]]+[words[1]]

    #l1+l2+next :
    wordsl1l2next=[]

    for k in range (2,len(words)):
        wordsl1l2next.append(words[k-2]+' '+words[k-1]+' '+words[k])

    wordsl1l2next.append(words[len(words)-2]+' '+words[len(words)-1]+' '+words[0])
    wordsl1l2next.append(words[len(words)-1]+' '+words[0]+' '+words[1])

    df['l1+l2+next']=wordsl1l2next

    #freq :
    freq = df["l1+l2+next"].value_counts()

    M=[]

    df = df.drop_duplicates() #on supprime les doublons dans notre table

    for k in df["l1+l2+next"]:
        M.append(freq[k])

    df["freq"]=M

    return(df)

def get_end_words(L): #L : string.split()
    # return every words at the end of a sentence in the train set
    end_words=[]

    for word in L:

        if word!= "" and word[-1] in ['.','!','?'] and word != '.':
            end_words.append(word)

    return(end_words)

def get_first_words(df,end_words): #df : tweets dataframe
                                   #end_words : list as returned by get_end_words

    first_words=[] #contains every serie word 1 + word 2 beginning a tweet in the trainig sample

    for tw in df["Tweet"]:

        t=tw.split()

        if len(t)>2:

            if "https" not in t[0] and "https" not in t[1]:

                first_words.append(t[0]+" "+t[1])

            for k in range (2,len(t)-1):

                if t[k]!="" and t[k-1] in end_words:

                    if "https" not in t[k] and "https" not in t[k+1]:

                        first_words.append (t[k]+" "+t[k+1])
    return(first_words)


def get_pivot(df):
#pivot_df.at['follow','l1+l2] gives the number of times the two words "l1+l2" have been followed by "next" in the training set
    pivot_df = df.pivot(index = 'next', columns= 'l1+l2', values='freq')
    pivot_df= pivot_df.fillna(0)
    sum_words = pivot_df.sum() #sum_words vector containing the sum of each column of pivot_df

    for col in pivot_df.columns:
        pivot_df[col]=pivot_df[col].apply(lambda x:x/sum_words[col])


    return(pivot_df)


def make_a_sentence(start,pivot_df,end_words):
    wordl1l2= start
    sentence=wordl1l2.split()

    while len(sentence) < 40:

        wordl1l2=sentence[-2]+' '+sentence[-1]
        print(len(words))
        print(len(list(pivot_df[wordl1l2])))
        next_word = choice(a = pivot_df.index, p = list(pivot_df[wordl1l2])) #random choice in the column weighted by the probabilities

        if next_word in end_words:


            if len(sentence) > 2:

                sentence.append(next_word)
                break
            else :
                continue
        else :

            sentence.append(next_word)
        word=sentence[-1]+' '+next_word
    sentence = ' '.join(sentence)
    return sentence