-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwebengine.py
More file actions
122 lines (103 loc) · 4.59 KB
/
webengine.py
File metadata and controls
122 lines (103 loc) · 4.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/local/bin/python
import argparse
import os
import requests
import shutil
from sys import argv, exit
from urlparse import urlparse
import commands
# Path to the SSL certificate bundle, taken from the SSL_CERT_FILE environment
# variable (empty string when the variable is unset). It is passed to
# requests.get() as ``verify`` in the script entry point; the original note
# says it also prevents wget errors.
# Read straight from the environment instead of spawning a shell to ``echo``
# it: no subprocess, no word-splitting of the value, and no dependency on the
# deprecated ``commands`` module.
cert_path = os.environ.get('SSL_CERT_FILE', '')
def _report_stage(failed, error_msg, done_msg):
    """Print the outcome line for one pipeline stage.

    Args:
        failed: value returned by the stage; truthy means the stage failed.
        error_msg: line printed when the stage failed.
        done_msg: line printed when the stage finished.

    Returns:
        True if the stage failed, False otherwise.
    """
    if failed:
        print(error_msg)
        return True
    print(done_msg)
    return False


def worldEngine(url, href, src, option, resp, case):
    """Run the whole gather / extract / correct / convert pipeline.

    Every stage lives in its own helper module, imported lazily right before
    it is used, and is treated as failed when it returns a truthy value. A
    failed stage is reported and the remaining stages still run.

    Args:
        url: address of the site or wiki. It is passed to the gatherers, and
            its hostname is used to build the base URL for image_gatherer.
        href: forwarded to file_corrector (the command-line help describes it
            as the URL of the new site).
        src: forwarded to file_corrector (the command-line help describes it
            as the path to the image directory).
        option: "site" selects site_gatherer; any other value selects
            url_gatherer followed by file_gatherer.
        resp: response object forwarded to url_gatherer.get_urlList.
        case: "hyde" makes the converted files use the "html" extension, any
            other value uses "md"; also forwarded to head_adder.

    Returns:
        True if at least one stage reported a failure, False otherwise, so
        callers can test the result the same way the stages are tested here.
    """
    # NOTE(review): the scheme is hard-coded to http and any port in `url` is
    # dropped; a URL without a hostname makes this concatenation raise.
    # Confirm what image_gatherer expects before changing it.
    base_url = "http://" + urlparse(url).hostname

    # Names of the working files / directories handed from stage to stage.
    url_file = "urlList"
    whole_site = "whole_site"
    extracted_files = "extracted_files"
    html_files = "html_files"
    html_dirty_files = "html_dirty_files"
    completed_files = "completed_files"
    image_files = "image_files"

    file_ext = "html" if case == "hyde" else "md"
    print("Will save completed_files with the file extension % s" % file_ext)

    failed = False

    if option != "site":
        from url_gatherer import get_urlList
        failed = _report_stage(
            get_urlList(url, url_file, resp),
            "!! Error: url_gatherer did not finish !!",
            "** url_gatherer finished **") or failed

        from file_gatherer import get_filesFromList
        failed = _report_stage(
            get_filesFromList(url_file, whole_site),
            "!! Error: file_gatherer did not finish !!",
            "** file_gatherer finished **") or failed
    else:
        from site_gatherer import get_siteFiles
        failed = _report_stage(
            get_siteFiles(url, whole_site, "na", "na"),
            "!! Error: site_gatherer did not finish !!",
            "** site_gatherer finished **") or failed

    from file_extractor import get_fileContent
    # The failure line used to name file_corrector here (copy-paste slip).
    failed = _report_stage(
        get_fileContent(whole_site, extracted_files),
        "!! Error: file_extractor did not finish !!",
        "** file_extractor finished **") or failed

    # Three helper modules export a function called get_correctedFiles; alias
    # each import so it is obvious which one is being called.
    from file_corrector import get_correctedFiles as correct_links
    failed = _report_stage(
        correct_links(extracted_files, html_files, href, src),
        "!! Error: file_corrector did not finish !!",
        "** file_corrector finished **") or failed

    from html_table_2_markdown import get_correctedFiles as convert_tables
    failed = _report_stage(
        convert_tables(html_files, html_dirty_files),
        "!! Error: html_table_2_markdown did not finish !!",
        "** html_table_2_markdown finished **") or failed

    from file_converter import get_convertedFiles
    failed = _report_stage(
        get_convertedFiles(html_dirty_files, completed_files, file_ext),
        "!! Error file_converter did not finish !!",
        "** file_converter finished **") or failed

    from image_gatherer import get_imageFiles
    failed = _report_stage(
        get_imageFiles(extracted_files, image_files, base_url),
        "!! Error image_gatherer did not finish !!",
        "** image_gatherer finished **") or failed

    from bold_cleanup import get_correctedFiles as clean_bold
    failed = _report_stage(
        clean_bold(completed_files),
        "!! Error bold_cleanup did not finish !!",
        "** bold_cleanup finished **") or failed

    from head_adder import add_headers
    failed = _report_stage(
        add_headers(completed_files, case),
        "!! Error head_adder did not finish !!",
        "** head_adder finished **") or failed

    return failed
if __name__ == "__main__":
    # Command-line entry point: check that the site answers, run the
    # pipeline, then describe the directories it produced.
    parser = argparse.ArgumentParser(description="get all content (html / images) from a wiki (site) and convert to markdown")
    parser.add_argument("url", help="URL to site or wiki")
    parser.add_argument("href", help="URL of the new site")
    parser.add_argument("src", help="Path to the image directory")
    parser.add_argument("type", help="\"site\" for full website or \"wiki\" for wiki Title Index page")
    parser.add_argument("case", help="jekyll, hyde, none")
    args = parser.parse_args()

    try:
        # An empty cert_path (SSL_CERT_FILE unset) is falsy, which requests
        # treats as "do not verify TLS"; fall back to True so the default CA
        # bundle is used instead of silently disabling verification.
        resp = requests.get(args.url, verify=cert_path or True)
    except requests.exceptions.RequestException:
        # DNS failure, refused connection, bad URL, TLS error, ...
        print("!! Sorry, site / wiki is not reachable, error occurred. !!")
        exit(1)

    if resp.status_code >= 400:
        print("!! Sorry, site / wiki is not reachable, error occurred. !!")
        # Non-zero status so calling scripts can detect the failure.
        exit(1)

    failed = worldEngine(args.url, args.href, args.src, args.type, resp, args.case)
    if failed:
        print("!! Error worldengine did not finish !!")
    else:
        print("** site is habitable **")

    # The pipeline keeps going after a failed stage, so the working
    # directories are described in both outcomes.
    print(" ")
    print("whole_site -> The whole site from wget")
    print("extracted_files -> just the body of the raw html files")
    print("html_files -> extracted file corrected with new href and src links")
    print("html_dirty_files -> html files with converted tables html to markdown")
    print("completed_files -> md or html files as you asked for")
    print("image_files -> all the image files from the old site (images)")
    print(" ")
    print("You may want to run md_files through file_trimmer.py to remove unwated headers and footers")
    print("If so remember to then run head_adder.py again by hand")
    print("to remove all generated files run webengine_cleanup.py")

    if failed:
        exit(1)