structured_scraper/scrape.php

47 lines
1.1 KiB
PHP

<?php
require __DIR__.'/vendor/autoload.php';
use Scraping\Scraper;
use Goutte\Client;
// Usage: the script takes one argument, the name of a PHP config file that is
// resolved relative to this script's directory and required into global scope.
if (count($argv) < 2) {
    // Report misuse on STDERR with a non-zero status so shells, cron jobs and
    // CI can detect it; die('...') with a string would exit with status 0.
    fwrite(STDERR, 'Usage: '.$argv[0]." <config.php>\n");
    exit(1);
}
$config_path = __DIR__.'/'.$argv[1];
if (!is_file($config_path) || !is_readable($config_path)) {
    // Fail with a clear message instead of a raw "failed opening required" fatal.
    fwrite(STDERR, '[!] Cannot read config file: '.$config_path."\n");
    exit(1);
}
require $config_path;
// Instantiate the two collaborators: the scraper that crawls the start URLs
// and the HTTP client used afterwards to fetch the documents it finds.
$scraper = new Scraper();
$client = new Client();

// Copy the settings onto the scraper. These variables are not defined in this
// script; they are expected to come from the config file required above.
$scraper->allowedMimetypes = $allowedMimetypes;
$scraper->link_rules = $link_rules;
$scraper->debug = $debug;

// Run the crawl; the outcome is read back from $scraper->results later on.
// NOTE(review): the second argument (0) looks like an initial depth/level
// counter -- confirm against Scraper::scrape().
$scraper->scrape($start_urls, 0);
// Download documents
//
// Walk every scraped result and store the ones flagged 'is-downloadable' under
// $download_dir, using the 'filename' recorded for each result. Files that are
// already present on disk are reported but neither fetched nor rewritten.
//
// $total_downloaded_docs counts every downloadable result seen;
// $new_scraped_docs counts only the files actually written during this run.
$total_downloaded_docs = 0;
$new_scraped_docs = 0;
foreach ($scraper->results as $url => $scraped_obj) {
    if (!$scraped_obj['is-downloadable']) {
        continue;
    }

    // NOTE(review): 'filename' presumably originates from scraped (remote)
    // data; if it can contain path separators, consider basename() here to
    // keep writes inside $download_dir -- confirm how Scraper builds it.
    $target_path = $download_dir.'/'.$scraped_obj['filename'];

    print '[+] Found '.$scraped_obj['filename'];
    if (file_exists($target_path)) {
        print " (already downloaded)\n";
    } else {
        // Only hit the network for files we do not have yet; issuing the
        // request before this check re-downloaded every document on each run.
        $client->request('GET', $url);
        $bytes_written = file_put_contents($target_path, $client->getResponse()->getContent());
        if ($bytes_written === false) {
            // Do not report or count a document that could not be saved.
            print " (FAILED to write)\n";
        } else {
            print " (NEW)\n";
            $new_scraped_docs++;
        }
    }
    $total_downloaded_docs++;
}
print '[*] Downloaded '.$new_scraped_docs.'/'.$total_downloaded_docs." new documents\n";