Intention_Chatbot/control.py at main · hannahchiu/Intention_Chatbot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
import re
import json
import pandas as pd
import numpy as np


def read_json(filename):
    '''Read in a json file.'''
    with open(filename, 'r') as json_file:
        data = json.load(json_file)
    return data

class Controller(object):
    """docstring for Controller"""
    def __init__(self, intent_pattern, entity_info, styleme_new, effect2ids, resp_info):
        super(Controller, self).__init__()
        self.intent_pattern = read_json(intent_pattern)
        self.entity_info = read_json(entity_info)
        self.item_info = pd.read_csv(styleme_new, sep='\t')        # styleme_new.tsv
        self.effect2ids = read_json(effect2ids)                    # effect -> all items ids belonging to the effect
        self.resp_info = read_json(resp_info)                      # 我寫的response (目前是直接回list裡的第一個)

        self.regex = {}
        self.prepare_regex()
        self.num_rand_product = 5
        self.neccess_info = [1, 2, 4, 6]  # brand, chinese_name, item (item_type), image
        self.prepare_item_regex()

    def prepare_item_regex(self):
        self.items_regex = {}
        # self.entity_info['item'][0] -> all possible item type
        for l in self.entity_info['item'][0]:
            str_list = []
            for item_name in self.entity_info[l][1]:  # all alias of that type
                str_list.append('(' + item_name + ')')
            self.items_regex[l] = '|'.join(str_list)  # combine them into a regex, so we can match any alias

        self.brands_regex = {}
        for l in self.entity_info['brand'][0]: # -> all possible brand
            str_list = []
            for brand_name in self.entity_info[l][1]:  # all alias of that brand
                str_list.append('(' + brand_name + ')')
            self.brands_regex[l] = '|'.join(str_list)  # combine them into a regex, so we can match any alias

        self.effects_regex = {}
        for l in self.entity_info['effect'][0]:
            str_list = []
            for effect_name in self.entity_info[l][1]:
                str_list.append('(' + effect_name + ')')
            self.effects_regex[l] = '|'.join(str_list)

    def prepare_regex(self):
        # loop thru every items
        find_paren_exp = r'\(([\w\|]*)\)'

        for intent, p_list in self.intent_pattern.items():
            self.regex[intent] = []
            for p in p_list:
                # for every pattern in p_list, we replace the placeholders with real value
                # and compile it into a compiled regex
                pattern = p
                m = re.findall(find_paren_exp, p)  # all (...|...|...) in the regex
                if m:
                    for subreg in m:  # for every (...|...|...)
                        # if subreg[:5] == 'brand' or subreg[:4] == 'name':
                        #     continue

                        # subereg_expand represents the new subreg after we replace the placeholders
                        subreg_expand = subreg
                        # handle 2 same placeholder in one regex
                        if subreg_expand[-2:] == '_1' or subreg_expand[-2:] == '_2':
                            subreg_expand = subreg_expand[:-2]

                        # get all entity in subreg
                        ent_list = subreg_expand.split('|') # every ... between | is an entity
                        for ent in ent_list:
                            if ent[:4] == 'name':         # do not handle names (too many of them, time consuming)
                                continue
                            elif ent[:5] == 'brand':      # replace 'brand' with all possible brands
                                replace_str = self.check_item_brand(ent)
                            else:
                                replace_str = self.check_item(ent)  # replace 'item' with all possible item types
                            subreg_expand = subreg_expand.replace(ent, replace_str)  # replace entities

                        # replace every subreg with expanded subreg
                        pattern = pattern.replace(subreg, subreg_expand)

                try:
                    compiled = re.compile(pattern)
                except:  # if sth goes wrong -> gg
                    print(pattern)
                    exit(0)

                # append all regex into a list, later compare it to input command 1 by 1
                self.regex[intent].append(compiled)

    def check_item(self, item):
        '''
            反覆去check此sense是否有下位的sense(可以用更細的概念取代)
            以及他是否有同義詞，把所有這些都拿進來變成一個regex
            確保所有符合此概念的詞都可以被match
        '''
        try:     # the first list are possible senses, terms are similar meaning words
            senses, terms = self.entity_info[item]
        except:  # if not a key in entity_info -> means it is a item (reach the bottom)
            return item
        sense_str_list = []

        terms = ['('+l+')' for l in terms]
        term_str = '|'.join(terms)  # combine all terms into a regex
        if senses:
            for s in senses:        # if the first list not empty -> need to check those senses
                sense_str = self.check_item(s)
                sense_str_list.append(sense_str)
            if terms:
                sense_str_list.append(term_str)
            return '|'.join(sense_str_list)  # return sense str and term str combined
        else:                       # if first list empty -> only return term str
            return term_str

    def check_item_brand(self, item):
        '''
            same as check item
            but add escape char to all special chars
        '''
        try:
            senses, terms = self.entity_info[item]
        except:
            return item
        sense_str_list = []

        terms = ['('+l.replace('+', '\+').replace('.', '\.').replace('*', '\*').replace('(', '\(').replace(')', '\)').replace('=', '\=')+')' for l in terms]
        term_str = '|'.join(terms)
        if senses:
            for s in senses:
                sense_str = self.check_item_brand(s)
                sense_str_list.append(sense_str)
            if terms:
                sense_str_list.append(term_str)
            return '|'.join(sense_str_list)
        else:
            return term_str

    def check_intent(self, cmd):
        match_str = ""      # the string matching the specified regex
        match_intent = None # the matched intention
        match_idx = 0       # the index of matched pattern in an intention
                            # (need it because we need to know whether user specify effect, brand or item)

        get_item = False    # whether display recommended item for users
        items = []          # retrieved item list,
        for intent, reg_list in self.regex.items():  # enumerate thru all intent
            for i, reg in enumerate(reg_list):       # enumerate thru all pattern in a intent
                search_str = re.search(reg, cmd)
                if search_str:  # if the command match reg
                    # here we want to find the longest matched string, so we actually compare to all possible patterns
                    # and then stay with the regex with longest match
                    if len(search_str.group(0)) > len(match_str):
                        match_str = search_str.group(0)
                        match_intent = intent
                        match_idx = i
        if match_str == "":  # after search all possible patterns in all intent -> still no match
            return "nomatch", "NO PATTERNS FOUND...", False, [], ""
        else:                # match to sth
            print('match_str', match_str)
            if match_intent == 'search_item':
                get_item = True

                # retrieve items from styleme_new.tsv
                items = self.get_items(cmd, match_idx not in [5, 6, 17], match_idx in [7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19])
                print('items', items)
                items_proc = self.process_item(items)  # some postprocessing
                print('processed', items_proc)
                items = items_proc                     # return processed items

            return match_intent, self.intent_pattern[match_intent][match_idx], get_item, items, self.resp_info[match_intent][match_idx][0]

    def control(self, cmd):
        cmd_string, pattern_string, get_item, items, resp_string = self.check_intent(cmd)
        return cmd_string, pattern_string, get_item, items, resp_string

    def get_items(self, cmd, specify_item, brand_or_effect):
        items = self.item_info  # styleme_new.tsv
        if specify_item:        # 某種美妝用品，如化妝水等
            item_list = []
            if brand_or_effect: # 除了種類之外，還要求某種功效或品牌
                for effect, regex in self.effects_regex.items():
                    search_str = re.search(regex, cmd)

                    if search_str:  # means belong to an effect, e.g. 保濕
                        ids = self.effect2ids[effect]  # get all item indices with the effect
                        items = items.iloc[ids]        # get all item belonging to the effect
                        print('effect', effect)
                        break

                for brand, regex in self.brands_regex.items():
                    search_str = re.search(regex, cmd)

                    if search_str:  # means belong to a brand, e.g. sk2
                        items = items[items['brand'] == brand]  # get all item belonging to the brand
                        print('brand', brand)
                        break


            # check for if item
            for item_type, regex in self.items_regex.items():
                search_str = re.search(regex, cmd)

                if search_str:  # means belong to an item, e.g. 化妝水
                    items = items[items['item'] == item_type]  # get all item belonging to the item type
                    print('item', item_type)
                    break


            return self.random_return(df=items, size=20)

        else:
            return self.random_return()  # if does not specify anything, just random return (self.num_rand_product=5) products

    def random_return(self, df=None, size=None):
        # all random
        if not isinstance(df, pd.DataFrame):
            rand_nums = np.random.randint(self.item_info.shape[0], size=self.num_rand_product)
            items = self.item_info.iloc[rand_nums, self.neccess_info]  # self.neccess_info -> only get neccessary info
            print(items)
            item_list = list(items.to_records(index=False))
            return item_list

        if not size:  # if not specify how many items to display -> 5
            size = self.num_rand_product

        # if specify df
        if df.shape[0] <= size:  # if df not bigger the predefined size
            return list(df.iloc[:, self.neccess_info].to_records(index=False))
        else:
            rand_nums = np.random.randint(df.shape[0], size=size)
            items = df.iloc[rand_nums, self.neccess_info]
            item_list = list(items.to_records(index=False))
            return item_list

    def process_item(self, items):
        if not items:
            return []
        else:
            new_items = []
            for item in items:
                if isinstance(item[1], str) and isinstance(item[3], str):  # only get item with 'chinese name' and 'image'
                    if isinstance(item[0], str): # if have brand name
                        new_items.append((item[0].replace('_', ' '), item[1], item[3]))
                    else:  # if no brand name -> handle NAN problem in pandas
                        new_items.append(('', item[1], item[3]))

            return new_items


ctrl = Controller('app/pattern/intent_pattern.json', 'app/pattern/entity_info.json', 'app/pattern/styleme_new.tsv', 'app/pattern/effect2ids.json', 'app/pattern/response.json')


if __name__ == '__main__':
    ctrl = Controller('app/pattern/intent_pattern.json', 'app/pattern/entity_info.json')
    # ctrl = Controller('app/pattern/intent_pattern.json', 'ent.json')
    for k, v in ctrl.intent_pattern.items():
        print(k)
    # find_paren_exp = r'\(([\w\|]*)\)'

    # for k, p_list in ctrl.intent_pattern.items():
    #     for p in p_list:
    #         if p != "((brand_1)的)?(name)的(price|feeling|color|smell|volume|CP|url|brand_2|pic|listed_time|comment|effect|info|tips|article|texture)(是什麼|如何)":
    #             continue
    #         pattern = p
    #         m = re.findall(find_paren_exp, p)
    #         if m:
    #             # print('---------------------')
    #             # print('pattern', p)
    #             # print(m)
    #             for subreg in m:
    #                 subreg_expand = subreg
    #                 if subreg_expand[-2:] == '_1' or subreg_expand[-2:] == '_2':
    #                     subreg_expand = subreg_expand[:-2]
    #                 print('subreg', subreg_expand)
    #                 ent_list = subreg_expand.split('|')
    #                 for ent in ent_list:
    #                     a = ctrl.check_item(ent)
    #                     subreg_expand = subreg_expand.replace(ent, a)
    #                 pattern = pattern.replace(subreg, subreg_expand)

    #         # print('====')
    #         print('new pattern', pattern)

#