From c6f1a797924a89dadb2a73815acbf86977bd2c10 Mon Sep 17 00:00:00 2001 From: loviuz Date: Sat, 8 Jan 2022 15:54:40 +0100 Subject: [PATCH] Completamento scraper con print links --- scrape.php | 4 +--- src/Scraping/Scraper.php | 45 +++++++++++++++++++++++++--------------- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/scrape.php b/scrape.php index cef5aae..963f8b5 100644 --- a/scrape.php +++ b/scrape.php @@ -7,6 +7,4 @@ use Scraping\Scraper; $scraper = new Scraper(); -foreach ($filters as $i => $filter) { - $scraper->scrape('GET', $start_url, $filters, $i, $download_dir); -} +$scraper->scrape('GET', $start_urls, $filters, 0, $download_dir); diff --git a/src/Scraping/Scraper.php b/src/Scraping/Scraper.php index 7f9e175..2aefcdd 100644 --- a/src/Scraping/Scraper.php +++ b/src/Scraping/Scraper.php @@ -8,32 +8,43 @@ class Scraper { public $links = []; - public static function scrape($method, $url, $filters, $level, $download_dir){ + public static function scrape($method, $urls, $filters, $level, $download_dir){ $client = new Client(); - $crawler = $client->request($method, $url); + $sub_urls = []; - if (strstr($client->getResponse()->getHeaders()['content-type'][0], 'text/html')) { - print str_pad('['.$level.']', $level*2, ' ', STR_PAD_LEFT).' '.$url."\n"; + foreach ($urls as $url) { + $crawler = $client->request($method, $url); - $crawler->filterXPath($filters[$level])->each(function ($node) use ($method, $filters, $level) { - $new_url = $node->link()->getUri(); - $new_level = $level+1; + //if (strstr($client->getResponse()->getHeaders()['content-type'][0], 'text/html')) { + print '['.$level.'] '.$url."\n"; - if ($new_level < count($filters)) { - self::scrape($method, $new_url, $filters, $new_level, $download_dir); + $new_urls = $crawler->filterXPath($filters[$level])->each(function ($node) use ($sub_urls) { + return $node->link()->getUri(); + }); + + $sub_urls = array_merge( $sub_urls, $new_urls ); + /* + } else { + $content_disposition = $client->getResponse()->getHeaders()['content-disposition'][0]; + + $filename = time(); + + if (preg_match('/filename="([^"]+)"/', $content_disposition, $m)) { + $filename = $m[1]; } - }); - } else { - $content_disposition = $client->getResponse()->getHeaders()['content-disposition'][0]; - $filename = time(); - - if (preg_match('/filename="([^"]+)"/', $content_disposition, $m)) { - $filename = $m[1]; + file_put_contents( $download_dir.'/'.$filename, $client->getResponse()->getContent() ); } + */ + } - file_put_contents( $download_dir.'/'.$filename, $client->getResponse()->getContent() ); + if($level==3){ + $a=1; + } + + if ($level++ < count($filters) && !empty($sub_urls)) { + self::scrape($method, $sub_urls, $filters, $level, $download_dir); } } } \ No newline at end of file