diff --git a/config.example.php b/config.example.php index c74fbd3..bfb9949 100644 --- a/config.example.php +++ b/config.example.php @@ -1,10 +1,24 @@ estensione con cui salvare +// i file +$allowedMimetypes = [ + 'application/pdf' => 'pdf' +]; + +// Directory dove salvare i file trovati $download_dir = __DIR__.'/pdf'; diff --git a/example/pagina1/sub1.1/index.html b/example/pagina1/sub1.1/index.html index 990f126..fad5e0f 100644 --- a/example/pagina1/sub1.1/index.html +++ b/example/pagina1/sub1.1/index.html @@ -1,2 +1,3 @@ Subsublink 1
-Subsublink 2
\ No newline at end of file +Subsublink 2
+Document
\ No newline at end of file diff --git a/example/pagina1/sub1.1/test.pdf b/example/pagina1/sub1.1/test.pdf new file mode 100644 index 0000000..64a8753 Binary files /dev/null and b/example/pagina1/sub1.1/test.pdf differ diff --git a/scrape.php b/scrape.php index 963f8b5..cfc561c 100644 --- a/scrape.php +++ b/scrape.php @@ -1,10 +1,32 @@ \n"); +} else { + require __DIR__.'/'.$argv[1]; +} + +$client = new Client(); $scraper = new Scraper(); -$scraper->scrape('GET', $start_urls, $filters, 0, $download_dir); +// Configure object +$scraper->allowedMimetypes = $allowedMimetypes; +$scraper->link_rules = $link_rules; + +$scraper->scrape($start_urls, 0); + +// Download documents +foreach ($scraper->results as $url => $scraped_obj) { + if ($scraped_obj['is-downloadable']) { + $client->request('GET', $url); + + print '[*] Downloading '.$scraped_obj['filename']."\n"; + file_put_contents( $download_dir.'/'.$scraped_obj['filename'], $client->getResponse()->getContent() ); + } +} \ No newline at end of file diff --git a/src/Scraping/Scraper.php b/src/Scraping/Scraper.php index 2aefcdd..37874dc 100644 --- a/src/Scraping/Scraper.php +++ b/src/Scraping/Scraper.php @@ -6,45 +6,103 @@ use \Goutte\Client; class Scraper { - public $links = []; + public $results = []; + public $allowedMimetypes; + public $link_rules; - public static function scrape($method, $urls, $filters, $level, $download_dir){ + /** + * Scrapes specified URLs + * + * @param array $urls + * @param int $level + */ + public function scrape(array $urls, int $level){ $client = new Client(); $sub_urls = []; foreach ($urls as $url) { - $crawler = $client->request($method, $url); + $crawler = $client->request('GET', $url); - //if (strstr($client->getResponse()->getHeaders()['content-type'][0], 'text/html')) { - print '['.$level.'] '.$url."\n"; - - $new_urls = $crawler->filterXPath($filters[$level])->each(function ($node) use ($sub_urls) { - return $node->link()->getUri(); - }); - - $sub_urls = array_merge( $sub_urls, $new_urls ); - /* - } else { - $content_disposition = $client->getResponse()->getHeaders()['content-disposition'][0]; - - $filename = time(); - - if (preg_match('/filename="([^"]+)"/', $content_disposition, $m)) { - $filename = $m[1]; - } - - file_put_contents( $download_dir.'/'.$filename, $client->getResponse()->getContent() ); + if( $url == 'http://localhost/example/pagina1/sub1.1/test.pdf' ){ + $a = 1; } - */ + + $scraped_obj[$url]['content-type'] = $this->getContentType($client); + $scraped_obj[$url]['is-downloadable'] = $this->isDownloadable($client); + $scraped_obj[$url]['filename'] = ( $scraped_obj[$url]['is-downloadable'] ? $this->getFilename($client) : '' ); + + $this->results = array_merge( $this->results, $scraped_obj ); + + $new_urls = $crawler->filterXPath($this->link_rules[$level])->each(function ($node) { + return $node->link()->getUri(); + }); + + $sub_urls = array_merge( $sub_urls, $new_urls ); } - if($level==3){ - $a=1; - } - - if ($level++ < count($filters) && !empty($sub_urls)) { - self::scrape($method, $sub_urls, $filters, $level, $download_dir); + if ($level++ < count($this->link_rules) && !empty($sub_urls)) { + $this->scrape($sub_urls, $level); } } + + /** + * Returns if URL is downloadable + * + * @return boolean + * + * @param Client $client + */ + public function isDownloadable(Client $client){ + foreach ($this->allowedMimetypes as $mimetype) { + if (isset($client->getResponse()->getHeaders()['content-type'][0])) { + if (strstr($client->getResponse()->getHeaders()['content-type'][0], $mimetype)) { + return true; + } + } + } + + return false; + } + + /** + * Get filename of http response based on URL or content-disposition header + * + * @return string + * + * @param Client $client + */ + public function getFilename(Client $client){ + $filename = basename( $client->getRequest()->getUri() ); + + // Try to get filename from content-disposition + if (isset($client->getResponse()->getHeaders()['content-disposition'][0])) { + $content_disposition = $client->getResponse()->getHeaders()['content-disposition'][0]; + + if (preg_match('/filename="([^"]+)"/', $content_disposition, $m)) { + $filename = $m[1].'-'.$this->allowedMimetypes[ $this->getContentType($client) ]; + } else { + $filename = time(); + } + } + + return $filename; + } + + /** + * Get content type of http response + * + * @return string + * + * @param Client $client + */ + public static function getContentType(Client $client){ + $content_type = ''; + + if (isset($client->getResponse()->getHeaders()['content-type'][0])) { + $content_type = $client->getResponse()->getHeaders()['content-type'][0]; + } + + return $content_type; + } } \ No newline at end of file