Compare commits

...

2 Commits

Author SHA1 Message Date
loviuz c6f1a79792 Completamento scraper con print links 2022-01-08 15:54:40 +01:00
loviuz 70dcd2deea Aggiornamento guida finale 2022-01-08 15:54:22 +01:00
5 changed files with 40 additions and 30 deletions

View File

@@ -11,14 +11,15 @@ Copia anche il file `example/config.php` nella root dello scraper, così da conf
Inserendo negli `$start_url` l'indirizzo sopra e avviando lo script, l'output dovrebbe essere:
```
[0] http://localhost/example/pagina1
[1] http://localhost/example/pagina1/sub1.1
[1] http://localhost/example/pagina1/sub1.2
[1] http://localhost/example/pagina1/sub1.3
[2] http://localhost/example/pagina1/sub1.1/subsub1.1.1
[2] http://localhost/example/pagina1/sub1.1/subsub1.1.2
[0] http://localhost/example/pagina2
[1] http://localhost/example/pagina2/sub2.1
[1] http://localhost/example/pagina2/sub2.2
[1] http://localhost/example/pagina2/sub2.3
[0] http://localhost/example
[1] http://localhost/example/pagina1
[1] http://localhost/example/pagina2
[2] http://localhost/example/pagina1/sub1.1
[2] http://localhost/example/pagina1/sub1.2
[2] http://localhost/example/pagina1/sub1.3
[2] http://localhost/example/pagina2/sub2.1
[2] http://localhost/example/pagina2/sub2.2
[2] http://localhost/example/pagina2/sub2.3
[3] http://localhost/example/pagina1/sub1.1/subsub1.1.1
[3] http://localhost/example/pagina1/sub1.1/subsub1.1.2
```

View File

View File

View File

@@ -7,6 +7,4 @@ use Scraping\Scraper;
$scraper = new Scraper();
foreach ($filters as $i => $filter) {
$scraper->scrape('GET', $start_url, $filters, $i, $download_dir);
}
$scraper->scrape('GET', $start_urls, $filters, 0, $download_dir);

View File

@@ -8,32 +8,43 @@ class Scraper
{
public $links = [];
public static function scrape($method, $url, $filters, $level, $download_dir){
public static function scrape($method, $urls, $filters, $level, $download_dir){
$client = new Client();
$crawler = $client->request($method, $url);
$sub_urls = [];
if (strstr($client->getResponse()->getHeaders()['content-type'][0], 'text/html')) {
print str_pad('['.$level.']', $level*2, ' ', STR_PAD_LEFT).' '.$url."\n";
foreach ($urls as $url) {
$crawler = $client->request($method, $url);
$crawler->filterXPath($filters[$level])->each(function ($node) use ($method, $filters, $level) {
$new_url = $node->link()->getUri();
$new_level = $level+1;
//if (strstr($client->getResponse()->getHeaders()['content-type'][0], 'text/html')) {
print '['.$level.'] '.$url."\n";
if ($new_level < count($filters)) {
self::scrape($method, $new_url, $filters, $new_level, $download_dir);
$new_urls = $crawler->filterXPath($filters[$level])->each(function ($node) use ($sub_urls) {
return $node->link()->getUri();
});
$sub_urls = array_merge( $sub_urls, $new_urls );
/*
} else {
$content_disposition = $client->getResponse()->getHeaders()['content-disposition'][0];
$filename = time();
if (preg_match('/filename="([^"]+)"/', $content_disposition, $m)) {
$filename = $m[1];
}
});
} else {
$content_disposition = $client->getResponse()->getHeaders()['content-disposition'][0];
$filename = time();
if (preg_match('/filename="([^"]+)"/', $content_disposition, $m)) {
$filename = $m[1];
file_put_contents( $download_dir.'/'.$filename, $client->getResponse()->getContent() );
}
*/
}
file_put_contents( $download_dir.'/'.$filename, $client->getResponse()->getContent() );
if($level==3){
$a=1;
}
if ($level++ < count($filters) && !empty($sub_urls)) {
self::scrape($method, $sub_urls, $filters, $level, $download_dir);
}
}
}