python3-torrent/html_parser.py at master · wenwen8/python3-torrent · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from bs4 import BeautifulSoup
import re
import urllib.parse


class HtmlParser(object):
    """Extract torrent entries (title, magnet link, Thunder link) from HTML."""

    def parserTwo(self, html):
        """Parse a torrent page and return its entries.

        Args:
            html: Page markup (``str`` or undecoded bytes). ``None`` means
                there is nothing to parse.

        Returns:
            The list of entry dicts built by ``_get_data``, or ``None`` when
            *html* is ``None``.
        """
        if html is None:
            return None
        if isinstance(html, str):
            # Already-decoded text: BeautifulSoup ignores ``from_encoding``
            # for str input and emits a warning, so do not pass it.
            soup = BeautifulSoup(html, 'html.parser')
        else:
            soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
        return self._get_data(soup)

    def _get_data(self, soup):
        """Bundle each entry's title, magnet link and Thunder link.

        Anchors are collected in three independent passes (href containing
        ``/detail``, ``magnet`` and ``thunder``) and paired by position.
        One dict is produced per ``/detail`` anchor; when the page has fewer
        magnet or Thunder anchors than titles, the missing value is ``None``
        instead of raising ``IndexError``.

        Args:
            soup: Parsed document exposing ``find_all``.

        Returns:
            A list of dicts with keys ``'title'`` (anchor text), ``'cl'``
            (magnet href) and ``'xl'`` (Thunder href).
        """
        title_links = soup.find_all('a', href=re.compile(r"/detail"))
        magnet_links = soup.find_all('a', href=re.compile(r"magnet"))
        thunder_links = soup.find_all('a', href=re.compile(r"thunder"))
        return [
            {
                'title': title_link.get_text(),
                'cl': self._href_at(magnet_links, index),
                'xl': self._href_at(thunder_links, index),
            }
            for index, title_link in enumerate(title_links)
        ]

    @staticmethod
    def _href_at(links, index):
        """Return the ``href`` of ``links[index]``, or ``None`` if out of range."""
        if index < len(links):
            return links[index].get('href')
        return None