frd-reporting/utils.py at main · freud-digital/frd-reporting · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import requests
import time
import pandas as pd
from config import FWF_I
from freud_api_crawler import freud_api_crawler as frd
from freud_api_crawler.string_utils import always_https


def _manifestation_row(node):
    """Flatten one manifestation resource object into a dict of ``man_*`` fields.

    Raises ``KeyError``/``TypeError`` when the resource lacks an expected
    attribute or relationship (e.g. no linked work); the caller skips such
    resources.
    """
    attributes = node['attributes']
    relationships = node['relationships']
    return {
        'man_id': node['id'],
        'man_title': attributes['title'],
        # A falsy transcription status (None, '', 0) is reported as 0.
        'man_field_status_umschrift': attributes['field_status_umschrift'] or 0,
        'man_sig': attributes['field_signatur_sfe_type'],
        'man_created': attributes['created'],
        'man_changed': attributes['changed'],
        'man_pages': len(relationships['field_seiten']['data']),
        'man_chapters': len(relationships['field_chapters']['data']),
        'man_website_url': f"https://www.freud-edition.net{attributes['path']['alias']}",
        'work_id': relationships['field_werk']['data']['id']
    }


def _work_row(node):
    """Flatten one included work resource object into a dict of work fields.

    Raises ``KeyError``/``TypeError`` when the resource lacks an expected
    attribute or relationship; the caller skips such resources.
    """
    attributes = node['attributes']
    return {
        'work_id': node['id'],
        'work_title': attributes['title'],
        'work_created': attributes['created'],
        'work_changed': attributes['changed'],
        'werk_signatur_id': node['relationships']['field_signatur_sfe']['data']['id'],
        'werk_website_url': f"https://www.freud-edition.net{attributes['path']['alias']}",
    }


def yield_manifestation():
    """ yields one flat record per manifestation, joined with its work

        Pages through the ``node/manifestation`` endpoint of ``frd.FRD_API``,
        filtered by ``frd.FULL_MANIFEST`` and ``frd.HISTORISCHE_AUSGABE`` and
        including ``field_werk``. For every page the manifestations (``data``)
        are inner-joined on ``work_id`` with the works (``included``).

        Resources missing one of the expected fields are skipped. Each
        requested URL is printed and requests are spaced by a short sleep.

        :return: Yields a dict holding the ``man_*`` and ``work_*``/``werk_*`` keys
        """
    url = f"{frd.FRD_API}node/manifestation?filter[field_doc_component.id]={frd.FULL_MANIFEST}&filter[field_manifestation_typ.id]={frd.HISTORISCHE_AUSGABE}&include=field_werk"  # noqa: E501
    next_page = True
    while next_page:
        print(url)
        time.sleep(0.07)
        response = requests.get(
            url,
            cookies=frd.AUTH_ITEMS['cookie'],
            allow_redirects=True,
        )
        result = response.json()
        links = result['links']
        if links.get('next', False):
            url = always_https(links['next']['href'])
        else:
            next_page = False

        mans = []
        for node in result['data']:
            try:
                mans.append(_manifestation_row(node))
            except (KeyError, TypeError, IndexError):
                # Best effort: an incomplete manifestation is left out.
                continue

        works = []
        # 'included' may be absent when no related work was returned.
        for node in result.get('included', []):
            try:
                works.append(_work_row(node))
            except (KeyError, TypeError, IndexError):
                # Best effort: an incomplete work is left out.
                continue

        # An empty DataFrame has no 'work_id' column, so merging it would
        # raise KeyError; an inner join with an empty side is empty anyway.
        if not mans or not works:
            continue

        merged = pd.merge(pd.DataFrame(mans), pd.DataFrame(works), on='work_id')
        yield from merged.to_dict(orient='records')


def yield_werk_signaturs():
    """ yields id and name of every ``signatur_fe`` taxonomy term

        Pages through the ``taxonomy_term/signatur_fe`` endpoint of
        ``frd.FRD_API`` (only the ``name`` field is requested), following the
        ``next`` link of each response until there is none. Each requested
        URL is printed and every request is preceded by a 0.3 second sleep.

        :return: Yields a dict with the keys ``werk_signatur_id`` and ``werk_signatur``
        """
    url = f"{frd.FRD_API}taxonomy_term/signatur_fe?fields[taxonomy_term--signatur_fe]=name"
    while True:
        print(url)
        time.sleep(0.3)
        payload = requests.get(
            url,
            cookies=frd.AUTH_ITEMS['cookie'],
            allow_redirects=True,
        ).json()
        page_links = payload['links']
        has_next = bool(page_links.get('next', False))
        if has_next:
            # Resolve the follow-up URL before emitting this page's terms.
            url = always_https(page_links['next']['href'])
        for term in payload['data']:
            yield {
                'werk_signatur_id': term['id'],
                'werk_signatur': term['attributes']['name']
            }
        if not has_next:
            break


def fwf_col(row):
    """ tells whether a row's signature prefix is listed in ``FWF_I``

        :param row: mapping-like object with a ``werk_signatur`` entry
            (presumably a DataFrame row whose signature starts with a year)

        :return: True if the first four characters of ``werk_signatur`` equal
            the string form of any entry in ``FWF_I``, otherwise False
        """
    prefix = row['werk_signatur'][:4]
    return prefix in map(str, FWF_I)