-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
73 lines (57 loc) · 2.15 KB
/
preprocessing.py
File metadata and controls
73 lines (57 loc) · 2.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from data_pre import data_merge, clean_data_generation
from s_EDA import EDA
import argparse
import pandas as pd
# Command-line interface for the preprocessing script.
parser = argparse.ArgumentParser(description='Data Preprocessing')

# Column names looked up in the cleaned DataFrame (see preprocessing()).
parser.add_argument('--f', default='user', type=str, help='input column')
parser.add_argument('--s', default='system', type=str, help='output column')
parser.add_argument('--t', default='sentiment', type=str, help='sentiment column')

# Sentence-length bounds, exposed as args.min_len / args.max_len.
# Help text: "minimum sentence length" / "maximum sentence length".
# NOTE(review): max_len is not read anywhere in this file's visible code; the
# whole namespace is forwarded to clean_data_generation, which presumably uses it.
parser.add_argument('--min-len', default=4, type=int, help='문장 최소 길이')
parser.add_argument('--max-len', default=32, type=int, help='문장 최대 길이')
def preprocessing(args, *data):
    """Merge, clean and EDA-augment the given datasets, appending rows to a CSV.

    Args:
        args: Parsed command-line namespace. This function reads ``args.f``,
            ``args.s`` and ``args.t`` (column names looked up in the cleaned
            frame) and ``args.min_len``; the whole namespace is also forwarded
            to ``clean_data_generation``.
        *data: Datasets, passed to ``data_merge`` as a single tuple.

    Returns:
        None. Rows are appended to ``data/Test1_data.csv``; the ``data``
        directory must already exist or ``open`` raises.
    """
    # Local import: csv is not imported at the top of this module.
    import csv

    # Merge the input datasets
    df = data_merge(data)

    # Clean the merged data
    pre_df = clean_data_generation(df, args)
    print('>> Data Preprocessing Done', '\n')

    file_name = 'Test1_data.csv'

    # Resolve the columns before touching the output file, so a wrong column
    # name fails without leaving a header-only file behind.
    inputs = pre_df[args.f]
    outputs = pre_df[args.s]
    sentiments = pre_df[args.t]

    cnt = 0
    # newline='' is what the csv module requires for files handed to
    # csv.writer; the context manager guarantees the file is flushed and
    # closed even if augmentation raises part-way through.
    with open(f'data/{file_name}', 'a', encoding='utf-8', newline='') as f:
        wr = csv.writer(f)

        # The file is opened in append mode, so only emit the header when the
        # file is empty; otherwise every re-run would inject a duplicate
        # header row into the middle of the data.
        if f.tell() == 0:
            wr.writerow(['user', 'system', 'label'])

        # EDA augmentation: one output row per distinct augmented input.
        for inp, out, sen in zip(inputs, outputs, sentiments):
            # set() drops duplicate variants returned by EDA for this input.
            for eda_input in set(EDA(inp)):
                # Skip variants shorter than the configured minimum length.
                if len(eda_input) >= args.min_len:
                    cnt += 1
                    wr.writerow([eda_input, out, sen])

    print(f'>> EDA Data {cnt}개 생성')
    print(f'>> {file_name} Data Save Complete')
if __name__ == '__main__':
    # Script entry point: parse CLI options, load the raw CSVs, run the pipeline.
    args = parser.parse_args()

    # NOTE(review): df_large is loaded but never passed to preprocessing()
    # below — confirm whether it should be included or the read dropped.
    df_large = pd.read_csv('data/[Ai_HUB]chatbot_data_large[34M].csv')
    df_small = pd.read_csv('data/[Ai_HUB]chatbot_data_small[2.7M].csv')
    # Only this file is read with an explicit cp949 encoding.
    df_voucher = pd.read_csv(
        'data/[DATAVoucher]chatbot_data[2M].csv',
        encoding='cp949',
    )
    df_sculpture = pd.read_csv('data/[MrMind]chatbot_data[0.57M].csv')
    df_doll = pd.read_csv('data/[MrMind]doll_chat_data[0.85M].csv')

    preprocessing(
        args,
        df_small,
        df_voucher,
        df_sculpture,
        df_doll,
    )