forked from attardi/wikiextractor
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathignoredTags.py
More file actions
24 lines (21 loc) · 761 Bytes
/
ignoredTags.py
File metadata and controls
24 lines (21 loc) · 761 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
# Markup for the tags listed here is dropped while the content they wrap is
# kept. 'a' is intentionally absent: it is handled separately, depending on
# keepLinks.
ignoredTags = [
    "abbr", "b", "big", "blockquote", "center", "cite", "div",
    "em", "font",
    "h1", "h2", "h3", "h4",
    "hiero", "i", "kbd", "nowiki", "p", "plaintext",
    "s", "span", "strike", "strong", "sub", "sup",
    "tt", "u", "var",
]
# Compiled (opening, closing) pattern pairs, one pair per registered tag.
ignored_tag_patterns = []


def ignoreTag(tag):
    """Compile and register the patterns that match one ignored tag.

    Appends an ``(opening, closing)`` pair of case-insensitive compiled
    patterns to the module-level ``ignored_tag_patterns`` list. Calling this
    twice with the same tag appends two pairs.

    :param tag: tag name such as ``'span'``; it is matched literally, so any
        regex metacharacters in it are escaped.
    """
    # The tag is a literal name, not a regex fragment.
    name = re.escape(tag)
    # \b prevents prefix matches (tag 'b' must not match <big>). Excluding
    # '/' means an opening tag containing a slash, e.g. the self-closing
    # <b/>, is not matched by this pattern.
    left = re.compile(r'<%s\b[^>/]*>' % name, re.IGNORECASE)
    # Accept optional whitespace both after '</' and before '>'.
    right = re.compile(r'</\s*%s\s*>' % name, re.IGNORECASE)
    ignored_tag_patterns.append((left, right))
def getIgnoredTags():
    """Return the shared list of compiled patterns for all ignored tags.

    Each name in ``ignoredTags`` is registered through ``ignoreTag`` the
    first time this function encounters it. Names already registered by an
    earlier call are skipped, so calling this function repeatedly does not
    append duplicate pattern pairs to ``ignored_tag_patterns``. Tags that
    were registered by calling ``ignoreTag`` directly are not tracked here.

    :return: the module-level ``ignored_tag_patterns`` list itself (not a
        copy), holding ``(opening, closing)`` compiled pattern pairs.
    """
    # Keep the set of tags handled so far on the function object, so the
    # bookkeeping needs no additional module-level name.
    registered = getIgnoredTags.__dict__.setdefault('_registered', set())
    for tag in ignoredTags:
        # The patterns are compiled case-insensitively, so names differing
        # only in case would yield equivalent patterns.
        key = tag.lower()
        if key not in registered:
            ignoreTag(tag)
            registered.add(key)
    return ignored_tag_patterns