Fix minori
This commit is contained in:
parent
6e7ce81d91
commit
e730d97135
13
scrape.php
13
scrape.php
@@ -23,14 +23,25 @@ $scraper->debug = $debug;
|
|||||||
$scraper->scrape($start_urls, 0);
|
$scraper->scrape($start_urls, 0);
|
||||||
|
|
||||||
// Download documents
|
// Download documents
|
||||||
|
$total_downloaded_docs = 0;
|
||||||
|
$new_scraped_docs = 0;
|
||||||
|
|
||||||
foreach ($scraper->results as $url => $scraped_obj) {
|
foreach ($scraper->results as $url => $scraped_obj) {
|
||||||
if ($scraped_obj['is-downloadable']) {
|
if ($scraped_obj['is-downloadable']) {
|
||||||
$client->request('GET', $url);
|
$client->request('GET', $url);
|
||||||
|
|
||||||
print '[*] Downloading '.$scraped_obj['filename']."\n";
|
print '[+] Found '.$scraped_obj['filename'];
|
||||||
|
|
||||||
if( !file_exists($download_dir.'/'.$scraped_obj['filename']) ){
|
if( !file_exists($download_dir.'/'.$scraped_obj['filename']) ){
|
||||||
|
print " (NEW)\n";
|
||||||
file_put_contents( $download_dir.'/'.$scraped_obj['filename'], $client->getResponse()->getContent() );
|
file_put_contents( $download_dir.'/'.$scraped_obj['filename'], $client->getResponse()->getContent() );
|
||||||
|
$new_scraped_docs++;
|
||||||
|
} else {
|
||||||
|
print " (already downloaded)\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$total_downloaded_docs++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
print '[*] Downloaded '.$new_scraped_docs.'/'.$total_downloaded_docs." new documents\n";
|
@@ -35,9 +35,13 @@ class Scraper
|
|||||||
|
|
||||||
$this->results = array_merge( $this->results, $scraped_obj );
|
$this->results = array_merge( $this->results, $scraped_obj );
|
||||||
|
|
||||||
|
$new_urls = [];
|
||||||
|
|
||||||
|
if (isset($this->link_rules[$level])) {
|
||||||
$new_urls = $crawler->filterXPath($this->link_rules[$level])->each(function ($node) {
|
$new_urls = $crawler->filterXPath($this->link_rules[$level])->each(function ($node) {
|
||||||
return $node->link()->getUri();
|
return $node->link()->getUri();
|
||||||
});
|
});
|
||||||
|
}
|
||||||
|
|
||||||
$sub_urls = array_merge( $sub_urls, $new_urls );
|
$sub_urls = array_merge( $sub_urls, $new_urls );
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user