Completamento scraper con print links
This commit is contained in:
parent
70dcd2deea
commit
c6f1a79792
@ -7,6 +7,4 @@ use Scraping\Scraper;
|
||||
|
||||
$scraper = new Scraper();
|
||||
|
||||
foreach ($filters as $i => $filter) {
|
||||
$scraper->scrape('GET', $start_url, $filters, $i, $download_dir);
|
||||
}
|
||||
$scraper->scrape('GET', $start_urls, $filters, 0, $download_dir);
|
||||
|
@ -8,32 +8,43 @@ class Scraper
|
||||
{
|
||||
public $links = [];
|
||||
|
||||
public static function scrape($method, $url, $filters, $level, $download_dir){
|
||||
public static function scrape($method, $urls, $filters, $level, $download_dir){
|
||||
$client = new Client();
|
||||
|
||||
$crawler = $client->request($method, $url);
|
||||
$sub_urls = [];
|
||||
|
||||
if (strstr($client->getResponse()->getHeaders()['content-type'][0], 'text/html')) {
|
||||
print str_pad('['.$level.']', $level*2, ' ', STR_PAD_LEFT).' '.$url."\n";
|
||||
foreach ($urls as $url) {
|
||||
$crawler = $client->request($method, $url);
|
||||
|
||||
$crawler->filterXPath($filters[$level])->each(function ($node) use ($method, $filters, $level) {
|
||||
$new_url = $node->link()->getUri();
|
||||
$new_level = $level+1;
|
||||
//if (strstr($client->getResponse()->getHeaders()['content-type'][0], 'text/html')) {
|
||||
print '['.$level.'] '.$url."\n";
|
||||
|
||||
if ($new_level < count($filters)) {
|
||||
self::scrape($method, $new_url, $filters, $new_level, $download_dir);
|
||||
$new_urls = $crawler->filterXPath($filters[$level])->each(function ($node) use ($sub_urls) {
|
||||
return $node->link()->getUri();
|
||||
});
|
||||
|
||||
$sub_urls = array_merge( $sub_urls, $new_urls );
|
||||
/*
|
||||
} else {
|
||||
$content_disposition = $client->getResponse()->getHeaders()['content-disposition'][0];
|
||||
|
||||
$filename = time();
|
||||
|
||||
if (preg_match('/filename="([^"]+)"/', $content_disposition, $m)) {
|
||||
$filename = $m[1];
|
||||
}
|
||||
});
|
||||
} else {
|
||||
$content_disposition = $client->getResponse()->getHeaders()['content-disposition'][0];
|
||||
|
||||
$filename = time();
|
||||
|
||||
if (preg_match('/filename="([^"]+)"/', $content_disposition, $m)) {
|
||||
$filename = $m[1];
|
||||
file_put_contents( $download_dir.'/'.$filename, $client->getResponse()->getContent() );
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
file_put_contents( $download_dir.'/'.$filename, $client->getResponse()->getContent() );
|
||||
if($level==3){
|
||||
$a=1;
|
||||
}
|
||||
|
||||
if ($level++ < count($filters) && !empty($sub_urls)) {
|
||||
self::scrape($method, $sub_urls, $filters, $level, $download_dir);
|
||||
}
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user