-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathjscrap.js
More file actions
154 lines (132 loc) · 3.96 KB
/
jscrap.js
File metadata and controls
154 lines (132 loc) · 3.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
"use strict";
var
http = require('http'),
https = require('https'),
zlib = require('zlib'),
htmlparser = require("htmlparser"),
zcsel = require('zcsel');
exports.scrap = function(url_or_data,opts,handler) {
var
self = this,
args = Array.prototype.slice.call(arguments, 0),
data = url_or_data,
pHandler,
parser,
pstart;
url_or_data = args.shift() || null;
handler = args.pop() || null;
opts = args.shift() || null;
if ( !url_or_data )
throw new Error("No URL or HTML data to scrap");
if ( !handler )
throw new Error("No callback");
// Input is an URL ? Get it!
_if ( data.match(/^https?:\/\//),
function(next){
self._get(url_or_data,opts,function(err,pageData,res){
if ( err )
return next(err,res);
data = pageData;
next(null,res);
});
},
function(err,res) {
if ( err )
return handler(err,null,res);
// Parse
pstart = new Date();
pHandler = new htmlparser.DefaultHandler(function(err,doc){
if ( opts && opts.debug )
console.log("HTML Parse: took "+(new Date()-pstart)+" ms");
if ( err )
return handler(err,null,res);
// Initialize document with ZCSEL and return it
var
istart = new Date(),
$ = zcsel.initDom(doc);
if ( opts && opts.debug )
console.log("ZCSel Init: took "+(new Date()-istart)+" ms");
return handler(null,$,res);
});
parser = new htmlparser.Parser(pHandler);
return parser.parseComplete(data);
}
);
};
exports._get = function(url, opts, handler) {
var
args = Array.prototype.slice.call(arguments, 0),
httpMod,
zipDecoder,
content = "",
start = new Date(),
timeout = null;
url = args.shift() || null;
handler = args.pop() || null;
opts = args.shift() || { followRedirects: 3, charsetEncoding: "utf-8" };
// Validation
if ( !url )
throw new Error("No URL to GET");
if ( !handler )
throw new Error("No callback");
// Auto-complete/normalize
if (typeof(url) == 'string') {
url = require('url').parse(url);
}
if (opts.headers) {
url.headers = opts.headers;
}
// Create a pseudo callback which destroys herself after being used
var _handler = function(err,data,res){
_handler = function(){};
if ( timeout )
clearTimeout(timeout);
handler(err,data,res);
};
// Timeout ? Start counting..
if ( opts.timeout ) {
timeout = setTimeout(function(){
_handler(new Error("HTTP request timeout after "+opts.timeout+" ms"),null,null);
},opts.timeout);
}
// GET
httpMod = url.protocol.match(/^https:/) ? https : http;
var req = httpMod.get(url, function(res){
if ( res.statusCode > 400 )
return _handler(new Error("Got HTTP status code "+res.statusCode+" on "+url),null,res);
if (res.statusCode >= 300 && res.statusCode < 400) {
if (res.headers['location'] && res.headers['location'].replace(/^[\s\r\n]*|[\s\r\n]*$/g,"") && opts.followRedirects) {
var location = require('url').resolve(url, res.headers['location']);
res.headers.location = location;
opts.followRedirects--;
return exports._get(location, opts, _handler);
}
return _handler(new Error("Found redirect without Location header"), null, res);
}
// Watch content encoding
if (res.headers['content-encoding']) {
var enc = res.headers['content-encoding'].toString().toLowerCase().replace(/^\s*|\s*$/g,"");
if ( enc == "gzip" )
zipDecoder = zlib.createGunzip();
else if ( enc == "deflate" )
zipDecoder = zlib.createInflate();
else
return _handler(new Error("Unsupported document encoding '"+enc+"'"),null);
res.pipe(zipDecoder);
}
// GET data
(zipDecoder || res).setEncoding(opts.charsetEncoding || "utf-8");
(zipDecoder || res).on('data',function(d){ content += d.toString(); });
(zipDecoder || res).on('end',function(){
if ( opts.debug )
console.log("HTTP GET: took "+(new Date()-start)+" ms");
return _handler(null,content,res);
});
})
.on('error',function(err){
return _handler(err,null,null);
});
};
/**
 * Optional asynchronous step.
 *
 * With a truthy `cond`, `a` is invoked and receives `b` as its only argument
 * (so it can call it when done); otherwise `b` is invoked directly with no
 * arguments. The return value of whichever function was invoked is passed
 * back to the caller.
 */
function _if(cond,a,b){
    if ( !cond ) {
        return b();
    }
    return a(b);
}