Fix minori

This commit is contained in:
loviuz 2022-01-09 01:25:50 +01:00
parent 6e7ce81d91
commit e730d97135
2 changed files with 20 additions and 5 deletions

View File

@ -23,14 +23,25 @@ $scraper->debug = $debug;
// Run the scraper on the start URLs; the second argument (0) is the starting level.
// NOTE(review): this span is a diff hunk rendered without +/- markers, so removed
// and added lines appear side by side (see the two consecutive print statements
// and the surplus closing brace below) — confirm against the real file before reuse.
$scraper->scrape($start_urls, 0);
// Download documents
// Counts every downloadable result encountered, whether or not it was written.
$total_downloaded_docs = 0;
// Counts only the documents written to disk during this run.
$new_scraped_docs = 0;
// $scraper->results is iterated as URL => scraped entry.
foreach ($scraper->results as $url => $scraped_obj) {
// Only entries flagged 'is-downloadable' are fetched.
if ($scraped_obj['is-downloadable']) {
// NOTE(review): the GET request is issued before the file_exists() check below,
// so documents already on disk are still fetched — confirm this is intended.
$client->request('GET', $url);
// NOTE(review): presumably the pre-change progress message, replaced by the
// '[+] Found' line that follows — verify against the actual file.
print '[*] Downloading '.$scraped_obj['filename']."\n";
// Printed without a trailing newline; the status suffix below completes the line.
print '[+] Found '.$scraped_obj['filename'];
// Write the file only when nothing exists yet at $download_dir/<filename>.
if( !file_exists($download_dir.'/'.$scraped_obj['filename']) ){
print " (NEW)\n";
// Persist the body of the response to the request made above.
file_put_contents( $download_dir.'/'.$scraped_obj['filename'], $client->getResponse()->getContent() );
$new_scraped_docs++;
} else {
print " (already downloaded)\n";
}
// Incremented for every downloadable entry, new or already present.
$total_downloaded_docs++;
}
}
// NOTE(review): this closing brace has no matching opener within the hunk —
// likely a leftover removed/context diff line; verify against the actual file.
}
// Summary line: documents newly written / downloadable documents seen.
print '[*] Downloaded '.$new_scraped_docs.'/'.$total_downloaded_docs." new documents\n";

View File

@ -35,9 +35,13 @@ class Scraper
$this->results = array_merge( $this->results, $scraped_obj );
$new_urls = $crawler->filterXPath($this->link_rules[$level])->each(function ($node) {
return $node->link()->getUri();
});
$new_urls = [];
if (isset($this->link_rules[$level])) {
$new_urls = $crawler->filterXPath($this->link_rules[$level])->each(function ($node) {
return $node->link()->getUri();
});
}
$sub_urls = array_merge( $sub_urls, $new_urls );
}