Patch summary (free text ahead of the first "diff --git" header is skipped by git apply / patch):
  * scrape.php: count the downloadable documents found and the ones newly written to disk,
    print "(NEW)" or "(already downloaded)" per file, and print a final summary line.
  * src/Scraping/Scraper.php: only run the link XPath for a level when a rule is set for
    that level; otherwise no new URLs are collected for it.
NOTE(review): line breaks were restored from a copy collapsed onto one line. Indentation and
the placement of blank context lines were reconstructed to match the hunk line counts and
should be confirmed against the target files (or apply with --ignore-whitespace).

diff --git a/scrape.php b/scrape.php
index 7597d23..1b88f90 100644
--- a/scrape.php
+++ b/scrape.php
@@ -23,14 +23,25 @@ $scraper->debug = $debug;
 $scraper->scrape($start_urls, 0);
 
 // Download documents
+$total_downloaded_docs = 0;
+$new_scraped_docs = 0;
+
 foreach ($scraper->results as $url => $scraped_obj) {
     if ($scraped_obj['is-downloadable']) {
         $client->request('GET', $url);
 
-        print '[*] Downloading '.$scraped_obj['filename']."\n";
+        print '[+] Found '.$scraped_obj['filename'];
 
         if( !file_exists($download_dir.'/'.$scraped_obj['filename']) ){
+            print " (NEW)\n";
             file_put_contents( $download_dir.'/'.$scraped_obj['filename'], $client->getResponse()->getContent() );
+            $new_scraped_docs++;
+        } else {
+            print " (already downloaded)\n";
         }
+
+        $total_downloaded_docs++;
     }
-}
\ No newline at end of file
+}
+
+print '[*] Downloaded '.$new_scraped_docs.'/'.$total_downloaded_docs." new documents\n";
\ No newline at end of file
diff --git a/src/Scraping/Scraper.php b/src/Scraping/Scraper.php
index 8184b66..6912db8 100644
--- a/src/Scraping/Scraper.php
+++ b/src/Scraping/Scraper.php
@@ -35,9 +35,13 @@ class Scraper
 
             $this->results = array_merge( $this->results, $scraped_obj );
 
-            $new_urls = $crawler->filterXPath($this->link_rules[$level])->each(function ($node) {
-                return $node->link()->getUri();
-            });
+            $new_urls = [];
+
+            if (isset($this->link_rules[$level])) {
+                $new_urls = $crawler->filterXPath($this->link_rules[$level])->each(function ($node) {
+                    return $node->link()->getUri();
+                });
+            }
 
             $sub_urls = array_merge( $sub_urls, $new_urls );
         }