diff --git a/config.example.php b/config.example.php
index c74fbd3..bfb9949 100644
--- a/config.example.php
+++ b/config.example.php
@@ -1,10 +1,24 @@
estensione con cui salvare
+// i file
+$allowedMimetypes = [
+ 'application/pdf' => 'pdf'
+];
+
+// Directory dove salvare i file trovati
$download_dir = __DIR__.'/pdf';
diff --git a/example/pagina1/sub1.1/index.html b/example/pagina1/sub1.1/index.html
index 990f126..fad5e0f 100644
--- a/example/pagina1/sub1.1/index.html
+++ b/example/pagina1/sub1.1/index.html
@@ -1,2 +1,3 @@
Subsublink 1
-Subsublink 2
\ No newline at end of file
+Subsublink 2
+Document
\ No newline at end of file
diff --git a/example/pagina1/sub1.1/test.pdf b/example/pagina1/sub1.1/test.pdf
new file mode 100644
index 0000000..64a8753
Binary files /dev/null and b/example/pagina1/sub1.1/test.pdf differ
diff --git a/scrape.php b/scrape.php
index 963f8b5..cfc561c 100644
--- a/scrape.php
+++ b/scrape.php
@@ -1,10 +1,32 @@
\n");
+} else {
+ require __DIR__.'/'.$argv[1];
+}
+
+$client = new Client();
$scraper = new Scraper();
-$scraper->scrape('GET', $start_urls, $filters, 0, $download_dir);
+// Configure object
+$scraper->allowedMimetypes = $allowedMimetypes;
+$scraper->link_rules = $link_rules;
+
+$scraper->scrape($start_urls, 0);
+
+// Download every scraped URL that was flagged as downloadable into $download_dir.
+// Make sure the target directory exists first, so a missing directory is reported
+// once instead of failing on every single file.
+if (!is_dir($download_dir) && !mkdir($download_dir, 0777, true) && !is_dir($download_dir)) {
+    die('[!] Cannot create download directory '.$download_dir."\n");
+}
+
+foreach ($scraper->results as $url => $scraped_obj) {
+    if (!$scraped_obj['is-downloadable']) {
+        continue;
+    }
+
+    // The filename may originate from a remote content-disposition header:
+    // keep only its last path component so it cannot escape $download_dir.
+    $filename = basename((string) $scraped_obj['filename']);
+    if ($filename === '' || $filename === '.' || $filename === '..') {
+        print '[!] Skipping '.$url." (no usable filename)\n";
+        continue;
+    }
+
+    $client->request('GET', $url);
+
+    print '[*] Downloading '.$filename."\n";
+
+    // file_put_contents() returns false on failure; report it instead of ignoring it.
+    if (file_put_contents($download_dir.'/'.$filename, $client->getResponse()->getContent()) === false) {
+        print '[!] Could not write '.$download_dir.'/'.$filename."\n";
+    }
+}
\ No newline at end of file
diff --git a/src/Scraping/Scraper.php b/src/Scraping/Scraper.php
index 2aefcdd..37874dc 100644
--- a/src/Scraping/Scraper.php
+++ b/src/Scraping/Scraper.php
@@ -6,45 +6,103 @@ use \Goutte\Client;
class Scraper
{
- public $links = [];
+ public $results = [];
+ public $allowedMimetypes;
+ public $link_rules;
- public static function scrape($method, $urls, $filters, $level, $download_dir){
+    /**
+     * Fetches every URL in $urls, records what was found and follows the links one level deeper.
+     *
+     * A GET request is issued for each URL and an entry is stored in $this->results, keyed
+     * by the URL, with the keys 'content-type', 'is-downloadable' and 'filename'
+     * ('filename' is an empty string when the response is not downloadable).
+     * Links are extracted with the XPath expression found in $this->link_rules[$level];
+     * the collected links are then scraped recursively with $level + 1.
+     *
+     * @param array $urls  URLs to request at this depth
+     * @param int   $level Zero-based depth, used as index into $this->link_rules
+     */
+    public function scrape(array $urls, int $level){
$client = new Client();
$sub_urls = [];
foreach ($urls as $url) {
- $crawler = $client->request($method, $url);
+            $crawler = $client->request('GET', $url);
- //if (strstr($client->getResponse()->getHeaders()['content-type'][0], 'text/html')) {
- print '['.$level.'] '.$url."\n";
-
- $new_urls = $crawler->filterXPath($filters[$level])->each(function ($node) use ($sub_urls) {
- return $node->link()->getUri();
- });
-
- $sub_urls = array_merge( $sub_urls, $new_urls );
- /*
- } else {
- $content_disposition = $client->getResponse()->getHeaders()['content-disposition'][0];
-
- $filename = time();
-
- if (preg_match('/filename="([^"]+)"/', $content_disposition, $m)) {
- $filename = $m[1];
- }
-
- file_put_contents( $download_dir.'/'.$filename, $client->getResponse()->getContent() );
+
+            $is_downloadable = $this->isDownloadable($client);
+
+            // Store the entry directly under its URL; a later visit of the same URL overwrites it.
+            $this->results[$url] = [
+                'content-type'    => $this->getContentType($client),
+                'is-downloadable' => $is_downloadable,
+                'filename'        => ( $is_downloadable ? $this->getFilename($client) : '' ),
+            ];
+
+            // The deepest level has no link rule: its URLs are leaves that only need to be
+            // recorded. Without this guard an undefined index would be read and null passed
+            // to filterXPath().
+            if (isset($this->link_rules[$level])) {
+                $new_urls = $crawler->filterXPath($this->link_rules[$level])->each(function ($node) {
+                    return $node->link()->getUri();
+                });
+
+                $sub_urls = array_merge( $sub_urls, $new_urls );
}
- */
}
- if($level==3){
- $a=1;
- }
-
- if ($level++ < count($filters) && !empty($sub_urls)) {
- self::scrape($method, $sub_urls, $filters, $level, $download_dir);
+        // Descend while this level had a rule and produced links.
+        if ($level < count($this->link_rules) && !empty($sub_urls)) {
+            $this->scrape($sub_urls, $level + 1);
}
}
+
+    /**
+     * Tells whether the client's last response carries one of the allowed mimetypes.
+     *
+     * $this->allowedMimetypes maps mimetype => file extension
+     * (e.g. 'application/pdf' => 'pdf'), so the mimetypes to compare against
+     * are the array keys, not the values.
+     *
+     * @param Client $client Client whose most recent response is inspected
+     *
+     * @return boolean True when the content-type header contains an allowed mimetype
+     */
+    public function isDownloadable(Client $client){
+        $headers = $client->getResponse()->getHeaders();
+
+        // Without a content-type header nothing can match.
+        if (!isset($headers['content-type'][0])) {
+            return false;
+        }
+
+        foreach (array_keys((array) $this->allowedMimetypes) as $mimetype) {
+            // Substring match because the header may carry parameters ("; charset=...");
+            // case-insensitive because mimetype names are not case-sensitive.
+            if (stripos($headers['content-type'][0], (string) $mimetype) !== false) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Builds a local filename for the client's last response.
+     *
+     * The default is the last path segment of the requested URL. When a
+     * content-disposition header is present, its filename parameter is used instead
+     * and the extension configured for the response mimetype in
+     * $this->allowedMimetypes is appended unless the name already ends with it.
+     * If the header has no filename parameter, or no name could be derived at all,
+     * the current Unix timestamp is used.
+     *
+     * @param Client $client Client whose most recent request/response is inspected
+     *
+     * @return string Filename without any directory component
+     */
+    public function getFilename(Client $client){
+        $headers = $client->getResponse()->getHeaders();
+
+        // Use only the path of the URL so query string and fragment do not end up in the name.
+        $path = parse_url((string) $client->getRequest()->getUri(), PHP_URL_PATH);
+        $filename = is_string($path) ? basename($path) : '';
+
+        // Try to get filename from content-disposition
+        if (isset($headers['content-disposition'][0])) {
+            // Accept both filename="name.ext" and the unquoted filename=name.ext form.
+            if (preg_match('/filename\s*=\s*"?([^";]+)"?/i', $headers['content-disposition'][0], $m)) {
+                // The header is remote-controlled: drop any directory part.
+                $filename = basename(trim($m[1]));
+
+                $content_type = self::getContentType($client);
+
+                foreach ((array) $this->allowedMimetypes as $mimetype => $extension) {
+                    // Substring match: the header may carry parameters after the mimetype.
+                    if (stripos($content_type, (string) $mimetype) === false) {
+                        continue;
+                    }
+
+                    $suffix = '.'.$extension;
+
+                    // Only append the extension when the name does not already end with it.
+                    if (strcasecmp(substr($filename, -strlen($suffix)), $suffix) !== 0) {
+                        $filename .= $suffix;
+                    }
+
+                    break;
+                }
+            } else {
+                $filename = (string) time();
+            }
+        }
+
+        // Never hand back an empty name (e.g. URL without a path).
+        if ($filename === '') {
+            $filename = (string) time();
+        }
+
+        return $filename;
+    }
+
+    /**
+     * Reads the content-type header of the client's last response.
+     *
+     * @param Client $client Client whose most recent response is inspected
+     *
+     * @return string First content-type header value, or an empty string when the header is absent
+     */
+    public static function getContentType(Client $client){
+        $headers = $client->getResponse()->getHeaders();
+
+        return $headers['content-type'][0] ?? '';
+    }
}
\ No newline at end of file