Kaamelott365/quotes.py at master · BenCoDev/Kaamelott365 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
import random
import pickle


class Quotes(object):
    BASE_URL = "http://fr.wikiquote.org"
    MAIN_PAGE_URL = BASE_URL + '/wiki/Kaamelott'

    def __init__(self):
        self.soup = None
        self.characters = []
        self.quotes_per_char = {}

    def pick_random(self):
        random_char_index = random.randrange(0, len(self.characters))
        random_char = self.characters[random_char_index]
        quote_index = random.randrange(0, len(self.quotes_per_char[random_char]))
        quote = self.quotes_per_char[random_char][quote_index]
        return quote.tweetable

    def with_scrape(self):
        """
        Scrape Wikipedia main page about Kaamelott
        :return:
        """
        r = requests.get(self.MAIN_PAGE_URL).text
        self.soup = BeautifulSoup(r)
        return self

    def with_pickle(self):
        """
        Instantiate from a local pickle instead of scraping
        :return:
        """
        with open('quotes.pickle', 'rb') as f:
            quotes_per_char = pickle.load(f)
        self.quotes_per_char = quotes_per_char
        self.characters = list(quotes_per_char.keys())
        return self

    def _load_characters(self):
        [self.characters.append(name.text) for name in self.soup.select('li .toclevel-2 a span.toctext')]
        return self

    def _load_quotes(self):
        for char_span in self.soup.select('h3 span.mw-headline'):
            cur_char = char_span.text
            quote = Quote(cur_char)

            for next_sibling in char_span.parent.next_siblings:
                is_tag = isinstance(next_sibling, Tag)

                if quote.is_complete():
                    self._enqueue_quote(quote)
                    quote = Quote(cur_char)  # instantiate a new quote to be completed

                if not is_tag:
                    continue
                if next_sibling.attrs.get('class') == ['citation']:
                    quote.text = next_sibling.text
                elif next_sibling.select('.ref'):
                    quote.meta = [ref.text for ref in next_sibling.select('.ref')]
                elif next_sibling.name == 'dl':
                    next_url = next_sibling.find('a').get('href')
                    sub_page_request = requests.get(self.BASE_URL + next_url)
                    for quote in self._load_sub_quotes(cur_char, sub_page_request):
                        self._enqueue_quote(quote)
                elif next_sibling.name in ['h3', 'h2']:  # new char or end / let be handled by parent loop
                    break
        return self

    def _load_sub_quotes(self, character, request):
        """
        Return quotes for characters with dedicated page
        :return: list(Quote)
        """
        soup = BeautifulSoup(request.text)
        quote = Quote(character)  # instantiate a new quote to be completed
        for quote_el in soup.select('.citation'):
            is_tag = isinstance(quote_el, Tag)
            if not is_tag:
                continue
            if quote.is_complete():
                yield quote
                quote = Quote(character)
            quote.text = quote_el.text

            try:
                quote.meta = [ref.text for ref in quote_el.find_next_sibling('ul').select('.ref') if quote_el.find_next_sibling('ul')]
            except AttributeError:
                continue

    def _enqueue_quote(self, quote):
        try:
            self.quotes_per_char[quote.character].append(quote)
        except KeyError:
            self.quotes_per_char[quote.character] = [quote]

    def pickle(self):
        with open('quotes.pickle', 'wb') as f:
            pickle.dump(self.quotes_per_char, f, pickle.HIGHEST_PROTOCOL)


class Quote(object):
    TWEET_LIMIT = 280

    def __init__(self, character, text=None, meta=None):
        self.character = character
        self.text = text
        self.meta = meta

    def is_complete(self):
        return self.character and self.text and self.meta

    def _trim(self):
        suffix = "[…]- " + self.character + " #Kaamelott"
        text = self.text[:self.TWEET_LIMIT - len(suffix) - 1]
        return text + suffix

    @property
    def tweetable(self):
        tweet = self.text + "- " + self.character + " #Kaamelott"
        if len(tweet) > self.TWEET_LIMIT:
            tweet = self._trim()
        return tweet