madrona-patch-scraper/scraper.py at main · teddywilson/madrona-patch-scraper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import json
import os
import re
import requests
import time
import urllib.request
from bs4 import BeautifulSoup
from sys import exit

BASE_FORUM_THREAD_URL = "https://madronalabs.com/topics/357-sticky-aalto-patch-thread"

HTML_PATCH_REGEX = re.compile("&lt[;]Aalto.*\/&gt", re.DOTALL)
HTML_PRESET_NAME_REGEX = re.compile('presetName=\\"[^\\"]*\\"')
JSON_PATCH_REGEX = re.compile('{.*}', re.DOTALL)

# Sanity threshold for page fetching
PAGE_INDEX_THRESHOLD = 300

PATCH_FILE_SUFFIX = ".mlpreset"

def fail(message):
    print(message)
    exit(1)


def scrape_patches():
    html_patches = []
    json_patches = []

    page_index = 1
    while True and page_index < PAGE_INDEX_THRESHOLD:
        url = BASE_FORUM_THREAD_URL + "?page=%d" % page_index
        page_index += 1

        response = requests.get(url)
        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.text, "html.parser")
        forum_posts = soup.find_all("div", class_="forum-post")

        # Take into account the sticky post
        post_count = len(forum_posts) - 1
        if post_count == 0:
            break

        print("Fetched " + str(post_count) + " posts on page " + str(page_index - 1) + ".")
        for forum_post in forum_posts:
            html_result = re.findall(HTML_PATCH_REGEX, str(forum_post))
            for html_match in html_result:
                html_patches.append(html_match)

            json_result = re.findall(JSON_PATCH_REGEX, str(forum_post))
            for json_match in json_result:
                json_patches.append(json_match)

    return html_patches, json_patches


def sanitize_html_preset_name(html_preset_name):
    if not html_preset_name.startswith('presetName="'):
        fail('Found html preset name with invalid prefix: %s' % html_preset_name)
    if not html_preset_name.endswith('"'):
        fail('Found html preset name with invalid suffix: %s' % html_preset_name)
    return html_preset_name[12:-1].replace('/', '_')


def sanitize_html_patch(html_patch):
    if not html_patch.startswith('&lt;Aalto'):
        fail('Found html patch with invalid prefix: %s' % html_patch)
    if not html_patch.endswith('/&gt'):
        fail('Found html patch with invalid suffix: %s' % html_patch)
    return "<" + html_patch[4:-3] + ">"


def sanitize_json_preset_name(json_preset_name):
    return json_preset_name.replace('/', '_')


def sanitize_json_patch(json_patch):
    return json_patch.replace('<br/>', '')


def write_patch(output_dir, preset_name, contents):
    print("Writing preset: " + preset_name + ".")
    filename = preset_name + PATCH_FILE_SUFFIX
    f = open(os.path.join(output_dir, filename), "w")
    f.write(contents)
    f.close()


def write_patches_to_output_dir(html_patches, json_patches, output_dir):
    for html_patch in html_patches:
        preset_name = re.search(HTML_PRESET_NAME_REGEX, html_patch)
        if preset_name is None:
            fail('Could not parse html preset name: %s' % match)
        write_patch(
          output_dir,
          sanitize_html_preset_name(preset_name.group()),
          sanitize_html_patch(html_patch)
        )

    for json_patch_str in json_patches:
        json_patch = json.loads(sanitize_json_patch(json_patch_str))
        write_patch(
          output_dir,
          sanitize_json_preset_name(json_patch['preset']),
          json.dumps(json_patch)
        )


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_dir', required=True)
    args = parser.parse_args()

    # Create the output directory if it doesn't already exist
    if not os.path.exists(args.output_dir):
        print('Creating directory %s' % args.output_dir)
        os.makedirs(args.output_dir)

    html_patches, json_patches  = scrape_patches()
    write_patches_to_output_dir(html_patches, json_patches, args.output_dir)