Fix minori
This commit is contained in:
parent
6e7ce81d91
commit
e730d97135
15
scrape.php
15
scrape.php
@@ -23,14 +23,25 @@ $scraper->debug = $debug;
|
||||
$scraper->scrape($start_urls, 0);
|
||||
|
||||
// Download documents
|
||||
$total_downloaded_docs = 0;
|
||||
$new_scraped_docs = 0;
|
||||
|
||||
foreach ($scraper->results as $url => $scraped_obj) {
|
||||
if ($scraped_obj['is-downloadable']) {
|
||||
$client->request('GET', $url);
|
||||
|
||||
print '[*] Downloading '.$scraped_obj['filename']."\n";
|
||||
print '[+] Found '.$scraped_obj['filename'];
|
||||
|
||||
if( !file_exists($download_dir.'/'.$scraped_obj['filename']) ){
|
||||
print " (NEW)\n";
|
||||
file_put_contents( $download_dir.'/'.$scraped_obj['filename'], $client->getResponse()->getContent() );
|
||||
$new_scraped_docs++;
|
||||
} else {
|
||||
print " (already downloaded)\n";
|
||||
}
|
||||
|
||||
$total_downloaded_docs++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
print '[*] Downloaded '.$new_scraped_docs.'/'.$total_downloaded_docs." new documents\n";
|
@@ -35,9 +35,13 @@ class Scraper
|
||||
|
||||
$this->results = array_merge( $this->results, $scraped_obj );
|
||||
|
||||
$new_urls = $crawler->filterXPath($this->link_rules[$level])->each(function ($node) {
|
||||
return $node->link()->getUri();
|
||||
});
|
||||
$new_urls = [];
|
||||
|
||||
if (isset($this->link_rules[$level])) {
|
||||
$new_urls = $crawler->filterXPath($this->link_rules[$level])->each(function ($node) {
|
||||
return $node->link()->getUri();
|
||||
});
|
||||
}
|
||||
|
||||
$sub_urls = array_merge( $sub_urls, $new_urls );
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user