50 lines
1.4 KiB
PHP
50 lines
1.4 KiB
PHP
<?php
|
|
|
|
namespace Scraping;
|
|
|
|
use \Goutte\Client;
|
|
|
|
/**
 * Level-by-level link crawler built on the Goutte client.
 *
 * Every crawl level has its own XPath expression in the filter list; the links
 * matched on the pages of one level become the URLs requested at the next.
 */
class Scraper
{
    /** @var array Not read or written by any code in this class. */
    public $links = [];

    /**
     * Requests every URL, collects the links matched by the current level's
     * XPath filter and recurses into those links using the next level's filter.
     *
     * Prints "[<level>] <url>" followed by a newline for each URL requested.
     *
     * @param string $method       HTTP method handed to the client's request().
     * @param array  $urls         URLs to request during this pass.
     * @param array  $filters      XPath expressions keyed by level.
     * @param int    $level        Key into $filters used for this pass.
     * @param string $download_dir Only forwarded to recursive calls. The branch
     *                             that saved non-HTML responses into this
     *                             directory is disabled, so nothing is written.
     *
     * @return void
     */
    public static function scrape($method, $urls, $filters, $level, $download_dir)
    {
        $client = new Client();
        $sub_urls = [];

        foreach ($urls as $url) {
            $crawler = $client->request($method, $url);

            print '['.$level.'] '.$url."\n";

            // NOTE: every response is treated as a page to extract links from.
            // A disabled branch used to store responses whose content-type was
            // not text/html under $download_dir, named after the
            // content-disposition filename (falling back to time()).
            $new_urls = $crawler->filterXPath($filters[$level])->each(static function ($node) {
                return $node->link()->getUri();
            });

            $sub_urls = array_merge($sub_urls, $new_urls);
        }

        // Keep the original depth bound, but additionally refuse to descend
        // into a level that has no filter. With a zero-based filter list the
        // bound alone allowed one extra pass in which every collected URL was
        // requested and then filtered with an undefined $filters entry.
        $next_level = $level + 1;

        if ($level < count($filters) && isset($filters[$next_level]) && !empty($sub_urls)) {
            self::scrape($method, $sub_urls, $filters, $next_level, $download_dir);
        }
    }
}