-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_figures_maths_lit.py
More file actions
64 lines (54 loc) · 2.49 KB
/
extract_figures_maths_lit.py
File metadata and controls
64 lines (54 loc) · 2.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
'''
Extract the figure attributions from the cnxmlplus files of a book (those
whose names start with 0-3), collect them in a list and write that list to
a text file.
'''
import os
from lxml import etree
# Directory that holds the book's cnxmlplus source files.
path = '/home/heather/Desktop/books/grade-10-lifescience-latex/english'
# One formatted attribution string per <attribution> element found.
figure_attribution_list = []
# Running total of <attribution> elements seen; printed as a progress indicator.
count = 0


def _child_text(element, tag):
    """Return the text of the first descendant ``tag`` of ``element``.

    Returns None when there is no such descendant or when it carries no
    text, so callers can treat "missing" and "empty" the same way.
    """
    child = element.find('.//' + tag)
    # Test identity with None, not truthiness: an element without children
    # is falsy even though it exists.
    if child is None or not child.text:
        return None
    return child.text


def _format_attribution(attribution):
    """Build the one-line description of an <attribution> element.

    The result reads ``<title> by <author> under <licence> licence`` with
    `` at <url>`` appended when a URL is given. A missing or empty title,
    author or licence is replaced by 'No title', 'anonymous' or 'unknown'.
    """
    title = _child_text(attribution, 'title') or 'No title'
    author = _child_text(attribution, 'author') or 'anonymous'
    licence = _child_text(attribution, 'licence') or 'unknown'
    url = _child_text(attribution, 'url')

    # Plain concatenation instead of str.format: under Python 2, formatting
    # non-ASCII text into a byte string raises UnicodeEncodeError, which
    # used to make such attributions disappear from the output silently.
    text = title + ' by ' + author + ' under ' + licence + ' licence'
    if url is not None:
        text = text + ' at ' + url
    return text


# os.listdir returns entries in arbitrary order; sorting keeps the output
# reproducible from run to run.
for file_name in sorted(os.listdir(path)):
    full_file_name = os.path.join(path, file_name)
    # Skip directories
    if os.path.isdir(full_file_name):
        continue
    # The directory holds more than the chapter sources: keep only the
    # cnxmlplus files whose name starts with a digit from 0 to 3.
    if not file_name.endswith('cnxmlplus'):
        continue
    if not file_name.startswith(('0', '1', '2', '3')):
        continue
    # Read the raw bytes and let the parser handle the encoding; the
    # context manager closes the file even if parsing fails.
    with open(full_file_name, 'rb') as source:
        xml = etree.XML(source.read())
    for fig_attribution in xml.findall('.//attribution'):
        count += 1
        # Parenthesised form is valid under both Python 2 and Python 3.
        print(count)
        figure_attribution_list.append(_format_attribution(fig_attribution))

# Write the collected attributions, as the string form of the list, to a
# file in the current working directory.
with open('gr10-lifescience-attributions', 'w') as output_file:
    output_file.write(str(figure_attribution_list))