Completamento classe scraping

This commit is contained in:
loviuz 2022-01-08 14:28:07 +01:00
parent c236ed9590
commit 71df5bcdcc
3 changed files with 23 additions and 12 deletions

1
.gitignore vendored
View File

@ -3,3 +3,4 @@ composer.phar
composer.lock composer.lock
pdf/* pdf/*
config.php config.php
launch.json

View File

@ -1,11 +1,12 @@
<?php

// Entry point: loads Composer's autoloader and the local configuration,
// then starts the crawl.
require __DIR__.'/vendor/autoload.php';
// Presumably defines $filters, $start_url and $download_dir — confirm against config.php.
require __DIR__.'/config.php';

use Scraping\Scraper;

$scraper = new Scraper();

// One crawl is started per filter level, each time from the start URL.
// NOTE(review): scrape() already recurses through the deeper levels on its own,
// so the passes for levels > 0 look redundant — confirm this is intended.
foreach (array_keys($filters) as $level) {
    $scraper->scrape('GET', $start_url, $filters, $level, $download_dir);
}

View File

@ -8,23 +8,32 @@ class Scraper
{ {
public $links = []; public $links = [];
/**
 * Requests $url and either follows its links or stores the response on disk.
 *
 * When the response is HTML, every node matched by the XPath expression
 * $filters[$level] is treated as a link and scraped recursively with the
 * next level's filter, as long as a filter exists for that level.
 * Any other response is written into $download_dir, named after the
 * Content-Disposition "filename" parameter when present, otherwise after the
 * current Unix timestamp.
 *
 * @param string $method       HTTP method used for every request (e.g. 'GET')
 * @param string $url          URL to request
 * @param array  $filters      XPath expressions, indexed by nesting level
 * @param int    $level        index into $filters for this request
 * @param string $download_dir directory that receives downloaded files
 */
public static function scrape($method, $url, $filters, $level, $download_dir)
{
    $client = new Client();
    $crawler = $client->request($method, $url);
    $response = $client->getResponse();
    $headers = $response->getHeaders();

    // A missing header is treated as an empty value instead of raising a notice.
    $content_type = $headers['content-type'][0] ?? '';

    // Media types are case-insensitive, hence stripos().
    if (stripos($content_type, 'text/html') !== false) {
        print str_pad('['.$level.']', $level*2, ' ', STR_PAD_LEFT).' '.$url."\n";

        // 1) List of acts
        // $download_dir must be captured explicitly: closures do not inherit
        // the enclosing scope, and the recursive call below needs it.
        $crawler->filterXPath($filters[$level])->each(function ($node) use ($method, $filters, $level, $download_dir) {
            $new_url = $node->link()->getUri();
            $new_level = $level + 1;

            if ($new_level < count($filters)) {
                self::scrape($method, $new_url, $filters, $new_level, $download_dir);
            }
        });

        return;
    }

    // Not HTML: treat the response body as a file to download.
    $content_disposition = $headers['content-disposition'][0] ?? '';
    $filename = (string) time();

    if (preg_match('/filename="([^"]+)"/', $content_disposition, $m)) {
        // The name comes from the remote server: keep only its last path
        // component so it cannot point outside $download_dir.
        $candidate = basename($m[1]);
        if ($candidate !== '') {
            $filename = $candidate;
        }
    }

    file_put_contents($download_dir.'/'.$filename, $response->getContent());
}
} }