import gzip
import sys
import time

import pandas as pd
import folium as fo

DUMP_TRUTHY = 'latest-truthy.nt.gz'
TOTAL_ENTITIES_TSV = "data/entities_total.tsv"
ERRORS_TSV = "data/errors.tsv"

def convert(seconds):
    """Format a duration in seconds as H:M:S."""
    return time.strftime("%H:%M:%S", time.gmtime(seconds))

def get_entities_coords(n):
    """Scan the truthy dump and write up to n entities that have a P625
    coordinate to entities.tsv; lines that cannot be parsed go to ERRORS_TSV."""
    start = time.time()
    n_entities = 0
    n_errors = 0
    with gzip.open(DUMP_TRUTHY, 'rt', encoding='utf8') as f, \
            open("entities.tsv", 'w') as entities, \
            open(ERRORS_TSV, 'w') as errors:
        print('Reading file...')
        entities.write('entity_url\tlatitude\tlongitude\n')
        for line in f:
            if n == n_entities:
                break
            if '/P625>' in line:
                try:
                    n_entities += 1
                    split = line.split(' ')
                    entity = split[0].split('Q')[1][:-1]  # '<https://www.wikidata.org/wiki/Q5>' -> 5
                    # The next lines extract the coordinates; a regex-based
                    # approach is planned as an improvement.
                    lon = split[2].split('(')[1]
                    lat = split[3].split(')"^^')[0]
                    entities.write('{}\t{}\t{}\n'.format(entity, lat.replace('.', ','), lon.replace('.', ',')))
                    print(f'Found {n_entities} entities with {n_errors} errors', end="\r")
                except Exception:
                    n_errors += 1
                    n_entities -= 1
                    errors.write(line)
    end = time.time()
    print('\nTime working: {}'.format(convert(end - start)))

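# A minimal sketch of the regex-based extraction mentioned above (an assumed
# improvement, not part of the original pipeline): it pulls the Q-id and the
# "Point(lon lat)" WKT literal from a P625 line in a single pass.
import re

COORD_RE = re.compile(r'/Q(\d+)>.*?"Point\(([-\d.]+) ([-\d.]+)\)"')

def parse_coord_line(line):
    """Return (entity_id, lat, lon) for a P625 line, or None if it does not match."""
    m = COORD_RE.search(line)
    if m is None:
        return None
    entity_id, lon, lat = m.groups()
    return entity_id, lat, lon
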
def entities_dict():
    """Return a {entity: 1} dictionary built from TOTAL_ENTITIES_TSV,
    used for fast membership tests."""
    print("* Getting pair {entity:1} dictionary.")
    start = time.time()
    D = {}
    n_entities = 0
    with open(TOTAL_ENTITIES_TSV, "r") as f:
        for line in f:
            key, _, _ = line.split()  # entity url, then geo coords (lat, lon)
            D[key] = 1
            n_entities += 1
            print(f"Number of entities added to dict: {n_entities}", end="\r")
    end = time.time()
    print(f"\nTime working getting the dict: {convert(end-start)}")
    return D

"""
Leer el dum truthy completo y si el 1er termino está en el diccionario
de entidades (P625), y si el 2do termino es http://www.wikidata.org/prop/direct/P31
Esta función retorna un diccionario que contiene la frecuencia de los tipos de
entidades del dataset (?)
"""
def count_types():
start = time.time()
d = entities_dict()
types_dict = {}
with gzip.open(DUMP_TRUTHY, 'rt', encoding='utf8') as f:
print("* Reading dump truthy...")
for line in f:
if '/P31>' in line:
split = line.split(' ')[:-1] # ignore last element ("\n") from the list
entity = split[0] # entity url
if entity in d:
e_types = split[2:] # list of the types of the entity
for et in e_types: # iterate over the entity types. can be more than one per entity
if et in types_dict:
types_dict[et] += 1
else:
types_dict[et] = 0 # base case
end = time.time()
print(f"\nTime working getting the types count dict: {convert(end-start)}")
return types_dict
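# For reference, a P31 line in the truthy dump looks roughly like this
# (illustrative values, not taken from the actual dump):
#
#   <http://www.wikidata.org/entity/Q42> <http://www.wikidata.org/prop/direct/P31> <http://www.wikidata.org/entity/Q5> .
#
# line.split(' ')[:-1] drops the trailing "." token, so split[0] is the entity
# URL and split[2:] holds the type URL(s) counted by count_types().
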
def get_types_count():
    """Write the type counts from count_types() to data/entity_types.tsv,
    logging keys that cannot be parsed to data/entity_types_errors.tsv."""
    start = time.time()
    d = count_types()
    n_type = 0
    print('Adding types and count in tsv file.\n')
    with open("data/entity_types.tsv", "w", encoding='utf8') as t, \
            open("data/entity_types_errors.tsv", "w", encoding="utf8") as e:
        t.write("entity_type\tcount\n")
        for k in d:
            try:
                type_key = k.split('Q')[1][:-1]  # keys without a Q-id raise IndexError
                t.write(f'{type_key}\t{d[k]}\n')
                n_type += 1
                print(f'Type {n_type} added to the file', end='\r')
            except Exception:
                e.write(k + "\n")
    end = time.time()
    print(f'\nTime working getting the tsv file of the types count: {convert(end-start)}')

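# NOTE: get_title_label() is used by types_count_to_md() below but is not
# defined in this file. A minimal sketch of what such a helper could look like
# (an assumption, not the original implementation) fetches the English label
# through the Wikidata wbgetentities API; it assumes requests is available.
import requests

def get_title_label(url):
    """Return the English label for an entity URL such as
    'https://www.wikidata.org/wiki/Q42' (hypothetical helper)."""
    qid = url.rsplit('/', 1)[-1]
    r = requests.get(
        'https://www.wikidata.org/w/api.php',
        params={'action': 'wbgetentities', 'ids': qid, 'props': 'labels',
                'languages': 'en', 'format': 'json'},
        timeout=30,
    )
    return r.json()['entities'][qid]['labels'].get('en', {}).get('value', '')
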
def types_count_to_md():
    df = pd.read_csv("data/entity_types.tsv", sep="\t")
    sort_df = df.sort_values(by='count', ascending=False)
    sort_df['entity_type'] = 'https://www.wikidata.org/wiki/Q' + sort_df['entity_type'].astype(str)
    sort_df['title_label'] = [get_title_label(u) for u in sort_df['entity_type']]
    print(sort_df[:25].to_markdown(index=False))

def types_count_to_html():
    df = pd.read_csv("data/entity_types.tsv", sep="\t")
    sort_df = df.sort_values(by='count', ascending=False)
    sort_df['entity_type'] = 'https://www.wikidata.org/wiki/Q' + sort_df['entity_type'].astype(str)
    # sort_df.reset_index(drop=True)
    sort_df[:25].to_html('entity_list.html', index=False)

def world_map(n):
    start = time.time()
    print("Reading total of entities...")
    data = pd.read_csv('results/entities_total.tsv', sep='\t')
    noe = 0
    # Creating map
    m = fo.Map(
        tiles="CartoDB dark_matter",
        location=[0, 0],
        zoom_control=False,
        prefer_canvas=True,
        zoom_start=2
    )
    lat = data['latitude']
    lon = data['longitude']
    for a, b in zip(lat, lon):
        if noe == n:
            break
        fo.CircleMarker(
            location=[float(a.replace(',', '.')), float(b.replace(',', '.'))],
            color="#fc2ac1",
            fill=False,
            weight=0.3,
            radius=0.5
        ).add_to(m)
        noe += 1
        print(f'{noe} entities added.', end='\r')
    print('\nSaving html file...')
    m.save("map.html")
    end = time.time()
    print(f'Time working: {convert(end-start)} (h:m:s)')

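# A sketch of the full pipeline (assumed order and parameters, since only the
# map step is run below): extract coordinates from the dump, count the P31
# types of those entities, render the top-25 table, then draw the map.
#
#     get_entities_coords(1_000_000)
#     get_types_count()
#     types_count_to_md()
#     world_map(100)
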
if __name__ == '__main__':
    world_map(100)