Fix minori

This commit is contained in:
loviuz 2022-01-09 01:25:50 +01:00
parent 6e7ce81d91
commit e730d97135
2 changed files with 20 additions and 5 deletions

View File

@@ -23,14 +23,25 @@ $scraper->debug = $debug;
 $scraper->scrape($start_urls, 0);
 // Download documents
+$total_downloaded_docs = 0;
+$new_scraped_docs = 0;
 foreach ($scraper->results as $url => $scraped_obj) {
     if ($scraped_obj['is-downloadable']) {
         $client->request('GET', $url);
-        print '[*] Downloading '.$scraped_obj['filename']."\n";
+        print '[+] Found '.$scraped_obj['filename'];
         if( !file_exists($download_dir.'/'.$scraped_obj['filename']) ){
+            print " (NEW)\n";
             file_put_contents( $download_dir.'/'.$scraped_obj['filename'], $client->getResponse()->getContent() );
+            $new_scraped_docs++;
+        } else {
+            print " (already downloaded)\n";
         }
+        $total_downloaded_docs++;
     }
 }
+print '[*] Downloaded '.$new_scraped_docs.'/'.$total_downloaded_docs." new documents\n";

View File

@@ -35,9 +35,13 @@ class Scraper
     $this->results = array_merge( $this->results, $scraped_obj );
-    $new_urls = $crawler->filterXPath($this->link_rules[$level])->each(function ($node) {
-        return $node->link()->getUri();
-    });
+    $new_urls = [];
+    if (isset($this->link_rules[$level])) {
+        $new_urls = $crawler->filterXPath($this->link_rules[$level])->each(function ($node) {
+            return $node->link()->getUri();
+        });
+    }
     $sub_urls = array_merge( $sub_urls, $new_urls );
 }