Compare commits

...

2 Commits

Author SHA1 Message Date
loviuz c6f1a79792 Completamento scraper con print links 2022-01-08 15:54:40 +01:00
loviuz 70dcd2deea Aggiornamento guida finale 2022-01-08 15:54:22 +01:00
5 changed files with 40 additions and 30 deletions

View File

@@ -11,14 +11,15 @@ Copia anche il file `example/config.php` nella root dello scraper, così da conf
Inserendo negli `$start_url` l'indirizzo sopra e avviando lo script, l'output dovrebbe essere:
```
[0] http://localhost/example/pagina1
[1] http://localhost/example/pagina1/sub1.1
[1] http://localhost/example/pagina1/sub1.2
[1] http://localhost/example/pagina1/sub1.3
[2] http://localhost/example/pagina1/sub1.1/subsub1.1.1
[2] http://localhost/example/pagina1/sub1.1/subsub1.1.2
[0] http://localhost/example/pagina2
[1] http://localhost/example/pagina2/sub2.1
[1] http://localhost/example/pagina2/sub2.2
[1] http://localhost/example/pagina2/sub2.3
[0] http://localhost/example
[1] http://localhost/example/pagina1
[1] http://localhost/example/pagina2
[2] http://localhost/example/pagina1/sub1.1
[2] http://localhost/example/pagina1/sub1.2
[2] http://localhost/example/pagina1/sub1.3
[2] http://localhost/example/pagina2/sub2.1
[2] http://localhost/example/pagina2/sub2.2
[2] http://localhost/example/pagina2/sub2.3
[3] http://localhost/example/pagina1/sub1.1/subsub1.1.1
[3] http://localhost/example/pagina1/sub1.1/subsub1.1.2
```

View File

View File

View File

@@ -7,6 +7,4 @@ use Scraping\Scraper;
$scraper = new Scraper();
foreach ($filters as $i => $filter) {
$scraper->scrape('GET', $start_url, $filters, $i, $download_dir);
}
$scraper->scrape('GET', $start_urls, $filters, 0, $download_dir);

View File

@@ -8,32 +8,43 @@ class Scraper
{
public $links = [];
public static function scrape($method, $url, $filters, $level, $download_dir){
public static function scrape($method, $urls, $filters, $level, $download_dir){
$client = new Client();
$crawler = $client->request($method, $url);
$sub_urls = [];
if (strstr($client->getResponse()->getHeaders()['content-type'][0], 'text/html')) {
print str_pad('['.$level.']', $level*2, ' ', STR_PAD_LEFT).' '.$url."\n";
foreach ($urls as $url) {
$crawler = $client->request($method, $url);
$crawler->filterXPath($filters[$level])->each(function ($node) use ($method, $filters, $level) {
$new_url = $node->link()->getUri();
$new_level = $level+1;
//if (strstr($client->getResponse()->getHeaders()['content-type'][0], 'text/html')) {
print '['.$level.'] '.$url."\n";
if ($new_level < count($filters)) {
self::scrape($method, $new_url, $filters, $new_level, $download_dir);
$new_urls = $crawler->filterXPath($filters[$level])->each(function ($node) use ($sub_urls) {
return $node->link()->getUri();
});
$sub_urls = array_merge( $sub_urls, $new_urls );
/*
} else {
$content_disposition = $client->getResponse()->getHeaders()['content-disposition'][0];
$filename = time();
if (preg_match('/filename="([^"]+)"/', $content_disposition, $m)) {
$filename = $m[1];
}
});
} else {
$content_disposition = $client->getResponse()->getHeaders()['content-disposition'][0];
$filename = time();
if (preg_match('/filename="([^"]+)"/', $content_disposition, $m)) {
$filename = $m[1];
file_put_contents( $download_dir.'/'.$filename, $client->getResponse()->getContent() );
}
*/
}
file_put_contents( $download_dir.'/'.$filename, $client->getResponse()->getContent() );
if($level==3){
$a=1;
}
if ($level++ < count($filters) && !empty($sub_urls)) {
self::scrape($method, $sub_urls, $filters, $level, $download_dir);
}
}
}