2022-01-06 17:20:40 +01:00
|
|
|
<?php
|
|
|
|
|
|
|
|
require __DIR__.'/vendor/autoload.php';
|
|
|
|
|
|
|
|
use Scraping\Scraper;
|
2022-01-08 20:30:33 +01:00
|
|
|
use Goutte\Client;
|
2022-01-06 17:20:40 +01:00
|
|
|
|
2022-01-08 20:30:33 +01:00
|
|
|
// Usage: exactly one argument is expected, the path of a PHP config file.
// The path is resolved relative to this script's directory.
if (count($argv) < 2) {
    // Misuse is reported on stderr with a non-zero exit status so that
    // shells and wrapper scripts can detect the failure.
    fwrite(STDERR, 'Usage: '.$argv[0]." <config.php>\n");
    exit(1);
}

$config_file = __DIR__.'/'.$argv[1];

// Fail with a readable message instead of the fatal error `require` would raise.
if (!is_file($config_file) || !is_readable($config_file)) {
    fwrite(STDERR, 'Cannot read config file: '.$config_file."\n");
    exit(1);
}

// The config file is executed in this scope; presumably it defines the
// settings variables the rest of the script reads (none are defined here).
require $config_file;
|
|
|
|
|
|
|
|
$client = new Client();
$scraper = new Scraper();

// The settings below are not defined anywhere in this script, so they must
// have been provided by the config file. Stop early and name whatever is
// missing rather than scraping with null settings.
$missing_settings = array_keys(array_filter([
    'allowedMimetypes' => !isset($allowedMimetypes),
    'link_rules'       => !isset($link_rules),
    'start_urls'       => !isset($start_urls),
]));
if ($missing_settings !== []) {
    fwrite(STDERR, 'Missing config variable(s): $'.implode(', $', $missing_settings)."\n");
    exit(1);
}

// Configure object
$scraper->allowedMimetypes = $allowedMimetypes;
$scraper->link_rules = $link_rules;
// Debug is treated as optional and off when the config does not set it.
// NOTE(review): confirm `false` matches what Scraper expects for "disabled".
$scraper->debug = $debug ?? false;

// Run the scrape from the configured start URLs; the second argument is
// passed as 0 exactly as before (presumably the initial depth — verify
// against Scraper::scrape).
$scraper->scrape($start_urls, 0);
|
|
|
|
|
|
|
|
// Download documents
//
// Walks the scraper results and stores every entry flagged 'is-downloadable'
// in $download_dir, skipping files that already exist there. Prints one line
// per document and a final summary of new vs. total downloadable documents.
$total_downloaded_docs = 0;
$new_scraped_docs = 0;

// Make sure the target directory exists before writing into it. The trailing
// is_dir() check covers the case where another process created it meanwhile.
if (!is_dir($download_dir) && !mkdir($download_dir, 0777, true) && !is_dir($download_dir)) {
    fwrite(STDERR, '[!] Cannot create download directory '.$download_dir."\n");
    exit(1);
}

foreach ($scraper->results as $url => $scraped_obj) {
    if (!$scraped_obj['is-downloadable']) {
        continue;
    }

    // Every downloadable document counts towards the total, new or not.
    $total_downloaded_docs++;

    print '[+] Found '.$scraped_obj['filename'];

    // The filename originates from scraped (untrusted) data: keep only its
    // last path component so it cannot point outside the download directory.
    $target_path = $download_dir.'/'.basename($scraped_obj['filename']);

    if (file_exists($target_path)) {
        print " (already downloaded)\n";
        continue;
    }

    print " (NEW)\n";

    // Fetch the document only now that we know it is actually needed.
    $client->request('GET', $url);
    $written = file_put_contents($target_path, $client->getResponse()->getContent());

    if ($written === false) {
        // Do not count a document that could not be stored.
        fwrite(STDERR, '[!] Could not write '.$target_path."\n");
        continue;
    }

    $new_scraped_docs++;
}

print '[*] Downloaded '.$new_scraped_docs.'/'.$total_downloaded_docs." new documents\n";
|