diff --git a/config.example.php b/config.example.php
index bfb9949..745871f 100644
--- a/config.example.php
+++ b/config.example.php
@@ -20,5 +20,8 @@ $allowedMimetypes = [
     'application/pdf' => 'pdf'
 ];
 
+// Specifica se visualizzare le URL scansionate
+$debug = true;
+
 // Directory dove salvare i file trovati
 $download_dir = __DIR__.'/pdf';
diff --git a/example/README.md b/example/README.md
index 22352d3..3e834f1 100644
--- a/example/README.md
+++ b/example/README.md
@@ -22,4 +22,6 @@ Inserendo negli `$start_url` l'indirizzo sopra e avviando lo script, dovreste tr
 [2] http://localhost/example/pagina2/sub2.3
 [3] http://localhost/example/pagina1/sub1.1/subsub1.1.1
 [3] http://localhost/example/pagina1/sub1.1/subsub1.1.2
+[3] http://localhost/example/pagina1/sub1.1/test.pdf
+[*] Downloading test.pdf
 ```
\ No newline at end of file
diff --git a/scrape.php b/scrape.php
index cfc561c..7a3003d 100644
--- a/scrape.php
+++ b/scrape.php
@@ -18,6 +18,7 @@ $scraper = new Scraper();
 // Configure object
 $scraper->allowedMimetypes = $allowedMimetypes;
 $scraper->link_rules = $link_rules;
+$scraper->debug = $debug;
 
 $scraper->scrape($start_urls, 0);
 
diff --git a/src/Scraping/Scraper.php b/src/Scraping/Scraper.php
index 37874dc..e375c5c 100644
--- a/src/Scraping/Scraper.php
+++ b/src/Scraping/Scraper.php
@@ -9,6 +9,7 @@ class Scraper
     public $results = [];
     public $allowedMimetypes;
     public $link_rules;
+    public $debug = false;
 
     /**
      * Scrapes specified URLs
@@ -24,8 +25,8 @@ class Scraper
         foreach ($urls as $url) {
             $crawler = $client->request('GET', $url);
 
-            if( $url == 'http://localhost/example/pagina1/sub1.1/test.pdf' ){
-                $a = 1;
+            if ($this->debug) {
+                print '['.$level.'] '.$url."\n";
             }
 
             $scraped_obj[$url]['content-type'] = $this->getContentType($client);