From 71df5bcdccf298d84be49262c01ddf2bace85bb3 Mon Sep 17 00:00:00 2001 From: loviuz Date: Sat, 8 Jan 2022 14:28:07 +0100 Subject: [PATCH] Completamento classe scraping --- .gitignore | 3 ++- scrape.php | 3 ++- src/Scraping/Scraper.php | 29 +++++++++++++++++++---------- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 1577a3c..48f15df 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ vendor/ composer.phar composer.lock pdf/* -config.php \ No newline at end of file +config.php +launch.json \ No newline at end of file diff --git a/scrape.php b/scrape.php index db9d0a3..cef5aae 100644 --- a/scrape.php +++ b/scrape.php @@ -1,11 +1,12 @@ $filter) { - $scraper->scrape('GET', $start_url, $filters, $i); + $scraper->scrape('GET', $start_url, $filters, $i, $download_dir); } diff --git a/src/Scraping/Scraper.php b/src/Scraping/Scraper.php index 73b098d..7f9e175 100644 --- a/src/Scraping/Scraper.php +++ b/src/Scraping/Scraper.php @@ -8,23 +8,32 @@ class Scraper { public $links = []; - public static function scrape($method, $url, $filters, $level){ + public static function scrape($method, $url, $filters, $level, $download_dir){ $client = new Client(); $crawler = $client->request($method, $url); - print str_pad('['.$level.']', $level*2, ' ', STR_PAD_LEFT).' '.$url."\n"; + if (strstr($client->getResponse()->getHeaders()['content-type'][0], 'text/html')) { + print str_pad('['.$level.']', $level*2, ' ', STR_PAD_LEFT).' '.$url."\n"; - // 1) Lista atti - $crawler->filterXPath($filters[$level])->each(function ($node) use ($method, $filters, $level) { - $new_url = $node->link()->getUri(); - $new_level = $level+1; + $crawler->filterXPath($filters[$level])->each(function ($node) use ($method, $filters, $level) { + $new_url = $node->link()->getUri(); + $new_level = $level+1; - //print str_pad('['.$new_level.']', $new_level*2, ' ', STR_PAD_RIGHT).' '.$new_url."\n"; + if ($new_level < count($filters)) { + self::scrape($method, $new_url, $filters, $new_level, $download_dir); + } + }); + } else { + $content_disposition = $client->getResponse()->getHeaders()['content-disposition'][0]; - if ($new_level < count($filters)) { - self::scrape($method, $new_url, $filters, $new_level); + $filename = time(); + + if (preg_match('/filename="([^"]+)"/', $content_disposition, $m)) { + $filename = $m[1]; } - }); + + file_put_contents( $download_dir.'/'.$filename, $client->getResponse()->getContent() ); + } } } \ No newline at end of file