Completamento classe scraping
This commit is contained in:
parent
c236ed9590
commit
71df5bcdcc
1
.gitignore
vendored
1
.gitignore
vendored
@ -3,3 +3,4 @@ composer.phar
|
|||||||
composer.lock
|
composer.lock
|
||||||
pdf/*
|
pdf/*
|
||||||
config.php
|
config.php
|
||||||
|
launch.json
|
@ -1,11 +1,12 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
require __DIR__.'/vendor/autoload.php';
|
require __DIR__.'/vendor/autoload.php';
|
||||||
|
require __DIR__.'/config.php';
|
||||||
|
|
||||||
use Scraping\Scraper;
|
use Scraping\Scraper;
|
||||||
|
|
||||||
$scraper = new Scraper();
|
$scraper = new Scraper();
|
||||||
|
|
||||||
foreach ($filters as $i => $filter) {
|
foreach ($filters as $i => $filter) {
|
||||||
$scraper->scrape('GET', $start_url, $filters, $i);
|
$scraper->scrape('GET', $start_url, $filters, $i, $download_dir);
|
||||||
}
|
}
|
||||||
|
@ -8,23 +8,32 @@ class Scraper
|
|||||||
{
|
{
|
||||||
public $links = [];
|
public $links = [];
|
||||||
|
|
||||||
public static function scrape($method, $url, $filters, $level){
|
public static function scrape($method, $url, $filters, $level, $download_dir){
|
||||||
$client = new Client();
|
$client = new Client();
|
||||||
|
|
||||||
$crawler = $client->request($method, $url);
|
$crawler = $client->request($method, $url);
|
||||||
|
|
||||||
|
if (strstr($client->getResponse()->getHeaders()['content-type'][0], 'text/html')) {
|
||||||
print str_pad('['.$level.']', $level*2, ' ', STR_PAD_LEFT).' '.$url."\n";
|
print str_pad('['.$level.']', $level*2, ' ', STR_PAD_LEFT).' '.$url."\n";
|
||||||
|
|
||||||
// 1) Lista atti
|
|
||||||
$crawler->filterXPath($filters[$level])->each(function ($node) use ($method, $filters, $level) {
|
$crawler->filterXPath($filters[$level])->each(function ($node) use ($method, $filters, $level) {
|
||||||
$new_url = $node->link()->getUri();
|
$new_url = $node->link()->getUri();
|
||||||
$new_level = $level+1;
|
$new_level = $level+1;
|
||||||
|
|
||||||
//print str_pad('['.$new_level.']', $new_level*2, ' ', STR_PAD_RIGHT).' '.$new_url."\n";
|
|
||||||
|
|
||||||
if ($new_level < count($filters)) {
|
if ($new_level < count($filters)) {
|
||||||
self::scrape($method, $new_url, $filters, $new_level);
|
self::scrape($method, $new_url, $filters, $new_level, $download_dir);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
} else {
|
||||||
|
$content_disposition = $client->getResponse()->getHeaders()['content-disposition'][0];
|
||||||
|
|
||||||
|
$filename = time();
|
||||||
|
|
||||||
|
if (preg_match('/filename="([^"]+)"/', $content_disposition, $m)) {
|
||||||
|
$filename = $m[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
file_put_contents( $download_dir.'/'.$filename, $client->getResponse()->getContent() );
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
Loading…
Reference in New Issue
Block a user