-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_cleanup.py
More file actions
executable file
·62 lines (44 loc) · 1.96 KB
/
data_cleanup.py
File metadata and controls
executable file
·62 lines (44 loc) · 1.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python3
import json
# Input: the complete dialect hierarchy (loaded below as hierarchy_data).
HIERARCHY_DATA_PATH = "frontend/src/app/services/dialect_hierarchy.json"
# Output: the filtered hierarchies are written to this file at the end of the script.
HIERARCHY_FILTERED_DATA_PATH = "frontend/src/app/services/dialect_hierarchy_filtered.json"
# Input: questionnaire data; every entry carries an 'answers' list.
QUESTIONNAIRE_DATA_PATH = "frontend/src/assets/cleaned_translation_questions.json"
# Input: judgment data; every entry carries a 'responses' list.
JUDGMENTS_DATA_PATH = "frontend/src/assets/likert_scales_merged.json"

# create separate dialect hierarchies for questionnaires and judgments:
# they don't all have the same dialects

# Context managers close the files deterministically; JSON is read as UTF-8
# instead of whatever the platform's locale encoding happens to be.
with open(HIERARCHY_DATA_PATH, encoding="utf-8") as hierarchy_file:
    hierarchy_data = json.load(hierarchy_file)

with open(QUESTIONNAIRE_DATA_PATH, encoding="utf-8") as questionnaire_file:
    questionnaire_data = json.load(questionnaire_file)

lookup = set()  # dialects in dataset
for question in questionnaire_data.values():
    for answer in question['answers']:
        # NOTE(review): assumes answer['dialect'] is a list of dialect names;
        # a plain string would be split into single characters - confirm.
        lookup.update(answer['dialect'])

matched = set()  # dialects in hierarchy
def filter_hierarchy(source: dict, lookup: set, matched: set) -> dict:
    """Return a pruned copy of a nested dialect hierarchy.

    A key of *source* is kept when it is contained in *lookup* or when at
    least one of its descendants is kept. The nesting of the kept keys is
    preserved; *source* itself is not modified.

    Args:
        source: Nested mapping of key -> child mapping. Every value must
            itself be a mapping (an empty one for leaves), because the
            function recurses into each value.
        lookup: Keys that occur in the dataset.
        matched: Set updated in place with every key that is kept. This
            includes ancestors that are kept only because of a descendant.

    Returns:
        A new nested dict holding only the kept keys.
    """
    target = {}
    for key, value in source.items():
        filtered_children = filter_hierarchy(value, lookup, matched)
        # add to hierarchy if it or any of its children
        # are in the data
        if key in lookup or filtered_children:
            matched.add(key)
            target[key] = filtered_children
    return target
# hierarchy restricted to the dialects that occur in the questionnaire data
result = {
    'question': filter_hierarchy(hierarchy_data, lookup, matched)
}

# report questionnaire dialects that were not found anywhere in the hierarchy
for dialect in lookup:
    if dialect not in matched:
        print(f"dialect in questionnaire data missing from hierarchy: {dialect}")

# Second pass: same procedure for the judgment data.
# The context manager closes the input file once it has been parsed.
with open(JUDGMENTS_DATA_PATH, encoding="utf-8") as judgments_file:
    judgment_data = json.load(judgments_file)

lookup = set()  # dialects in dataset
for judgment in judgment_data.values():
    for response in judgment['responses']:
        lookup.update(response['dialects'])

matched = set()  # dialects in hierarchy
result['judgment'] = filter_hierarchy(hierarchy_data, lookup, matched)

# report judgment dialects that were not found anywhere in the hierarchy
for dialect in lookup:
    if dialect not in matched:
        print(f"dialect in judgment data missing from hierarchy: {dialect}")

# Closing the output file explicitly guarantees the JSON is flushed to disk
# before the script exits, rather than relying on garbage collection.
with open(HIERARCHY_FILTERED_DATA_PATH, "w", encoding="utf-8") as filtered_file:
    json.dump(result, filtered_file, indent=4)