-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtranscripts_1_documents.py
More file actions
89 lines (56 loc) · 2.13 KB
/
transcripts_1_documents.py
File metadata and controls
89 lines (56 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import requests, json, re
# Matches any run of whitespace in the raw, comma-separated interviewers string.
_WHITESPACE_RUN = re.compile(r'\s+')


def _load_json(path):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


def _dump_json(data, path):
    """Write *data* to *path* as 2-space-indented JSON, closing the file afterwards."""
    with open(path, 'w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=2)


def _split_interviewers(raw):
    """Split a comma-separated interviewers string into unique names.

    Whitespace runs are collapsed to a single space and the space after a
    comma is dropped before splitting.  Duplicates are removed while keeping
    first-seen order, so the result is the same on every run (a plain
    ``set`` would order the names by randomised string hashes).  An empty
    input yields ``['']``; names are not otherwise stripped.
    """
    normalized = _WHITESPACE_RUN.sub(' ', raw).replace(', ', ',')
    return list(dict.fromkeys(normalized.split(',')))


def build_documents(transcripts, transcripts_counts, authority):
    """Enrich transcript records and derive interviewer / organisation records.

    Each dict in *transcripts* is modified in place:

    * ``ljSlug`` is set from the *authority* entry whose ``uri`` equals the
      transcript's ``intervieweeURI`` (left unset when nothing matches; the
      last matching entry wins).
    * ``interviewers`` is replaced by the list of unique names parsed from
      the original comma-separated string.
    * ``interviewers_ljSlug`` receives one slug (name with spaces replaced by
      underscores) per non-empty interviewer, in the same order.
    * ``totalPairs`` / ``totalResponse`` are copied from the
      *transcripts_counts* row whose ``transcript`` equals the transcript's
      ``md5`` (left unset when nothing matches; the last matching row wins).

    Interviewers whose slug does not appear as an ``ljSlug`` in *authority*
    get a new ``{"name", "ljSlug", "type": "interviewer"}`` record, one per
    distinct name, in first-seen order.  One ``{"name", "ljSlug"}`` record is
    produced per distinct ``sourceName``, in first-seen order.

    Returns:
        A ``(transcripts, built_interviewers, built_orgs)`` tuple, where
        ``transcripts`` is the same list object that was passed in.

    Raises:
        KeyError: if a record lacks one of the keys read above.
    """
    # Index the reference data once instead of rescanning it per transcript.
    slug_by_uri = {entry['uri']: entry['ljSlug'] for entry in authority}
    authority_slugs = {entry['ljSlug'] for entry in authority}
    counts_by_md5 = {row['transcript']: row for row in transcripts_counts}

    missing = {}  # interviewer name -> slug, for names absent from authority
    orgs = {}     # used as an insertion-ordered set of sourceName values

    for doc in transcripts:
        uri = doc['intervieweeURI']
        if uri in slug_by_uri:
            doc['ljSlug'] = slug_by_uri[uri]

        doc['interviewers'] = _split_interviewers(doc['interviewers'])

        slugs = []
        for name in doc['interviewers']:
            if name == '':
                continue
            slug = name.replace(' ', '_')
            # Every non-empty interviewer ends up with a record: either it is
            # already in the authority, or one is built for it below.  So the
            # slug is always listed, exactly once per interviewer.
            slugs.append(slug)
            if slug not in authority_slugs:
                missing.setdefault(name, slug)
        doc['interviewers_ljSlug'] = slugs

        counts = counts_by_md5.get(doc['md5'])
        if counts is not None:
            doc['totalPairs'] = counts['totalPairs']
            doc['totalResponse'] = counts['totalResponse']

        orgs.setdefault(doc['sourceName'], None)

    built_interviewers = [
        {"name": name, "ljSlug": slug, "type": "interviewer"}
        for name, slug in missing.items()
    ]
    built_orgs = [
        {"name": org.replace('_', ' '), "ljSlug": org}
        for org in orgs
    ]
    return transcripts, built_interviewers, built_orgs


if __name__ == "__main__":
    transcripts, built_interviewers, built_orgs = build_documents(
        _load_json('data/transcripts_db_dump.json'),
        _load_json('data/transcripts_counts.json'),
        _load_json('data/authority_cleaned.json'),
    )

    _dump_json(transcripts, 'data/transcripts_documents.json')
    _dump_json(built_interviewers, 'data/transcripts_interviewers.json')
    _dump_json(built_orgs, 'data/transcripts_orgs.json')