-
Notifications
You must be signed in to change notification settings - Fork 0
Quick Start
The most basic scenario for a spider is requesting an HTML page or an interface that returns JSON/XML as its response. Pider supplies a convenient way to do this.
use Pider\Spider;
use Pider\Http\Response;
use Pider\Http\Request;
class BasicSpider extends Spider {
protected $start_urls = ['https://item.jd.com/1304924.html'];
protected $domain = ['www.jd.com'];
/**
 * Build the initial requests for this spider.
 *
 * One Request is created per entry in $start_urls, so the seed URL is
 * declared in a single place instead of being repeated in this method.
 *
 * @return Request[] the requests the crawl starts from
 */
public function start_requests(): array {
    $requests = [];
    foreach ($this->start_urls as $url) {
        $requests[] = new Request(['base_uri' => $url]);
    }
    return $requests;
}
/**
 * Default callback: prints a marker line, then returns a second Request
 * that carries its own inline callback.
 *
 * @param Response $response response handed to this callback (not used here)
 * @return Request request for the second page, with a closure as callback
 */
public function parse(Response $response) {
    echo "The first callback !", PHP_EOL;

    // Closure passed as the Request's second constructor argument.
    $onSecondResponse = function($response) {
        echo "The second callback !", PHP_EOL;
    };
    $options = ['base_uri' => 'https://item.jd.com/302813.html'];

    return new Request($options, $onSecondResponse);
}
}
The most important thing for a spider is to parse useful information from HTML or JSON/XML responses. Pider provides a callback mechanism for parsing pieces of information from responses. The default callback is parse, which must be implemented in a spider instance if no other callbacks are defined, but you can define your own callback when creating a Request object.
use Pider\Spider;
use Pider\Http\Response;
use Pider\Http\Request;
class GeneralistAttachSpider extends Spider {
protected $domain = ['www.jd.com'];
protected $start_urls = ['https://item.jd.com/16793643085.html'];
/**
 * Create a price request with a custom callback and attach extra data to it.
 *
 * The closure forwards its response to parse_price(); the array assigned to
 * the request's attachment property is what parse_price() later dumps.
 * NOTE(review): presumably the framework copies the attachment from the
 * request onto its response - confirm against Pider.
 *
 * @param Response $response response handed to this callback (not used here)
 * @return Request the price request, carrying the attachment
 */
public function parse(Response $response) {
    $priceUrl = 'https://p.3.cn/prices/mgets?callback=jQuery5925702&type=1&skuIds=J_2286746';

    $priceRequest = new Request(
        ['base_uri' => $priceUrl],
        function($response) {
            $this->parse_price($response);
        }
    );
    $priceRequest->attachment = ['test', 'test2'];

    return $priceRequest;
}
/**
 * Dump the attachment found on the given response.
 *
 * @param Response $response response whose attachment property is printed
 */
public function parse_price(Response $response) {
    $attachment = $response->attachment;
    var_dump($attachment);
}
}
For a spider, most of the time is spent waiting on the network. So when there are a lot of requests, we have to consider performance. Unlike older machines, most computers nowadays have more than one processor, so we can make use of multiple processes to speed up our spider and save a lot of time. Of course, Pider can work with multiple processes.
use Pider\Spider;
use Pider\Http\Response;
/**
 * Example spider that fetches several product pages and declares a
 * process count of 4 for the crawl.
 */
class MultiSpider extends Spider {
    /** Seed URLs for the crawl. */
    protected $start_urls = [
        'https://item.jd.com/1378700118.html',
        'https://item.jd.com/302813.html',
        'https://item.jd.com/1304924.html',
        'https://item.jd.com/2286746.html',
    ];

    /** Domains associated with this spider. */
    protected $domains = ['www.jd.com'];

    /** Process count; presumably the number of worker processes Pider starts - confirm. */
    protected $processes = 4;

    /** Counter bumped once per parsed response; starts at 1. */
    protected $count = 1;

    /**
     * Dump the URL of the response and increment the counter.
     *
     * NOTE(review): with several processes each worker presumably holds its
     * own copy of $count, so it would not be a global total - confirm.
     *
     * @param Response $response response whose URL is printed
     */
    public function parse(Response $response) {
        $url = $response->getUrl();
        var_dump($url);
        ++$this->count;
    }
}