Primo commit
This commit is contained in:
parent
f484a21347
commit
9db2ec22d9
10
composer.json
Normal file
10
composer.json
Normal file
@ -0,0 +1,10 @@
|
||||
{
|
||||
"require": {
|
||||
"fabpot/goutte": "^4.0"
|
||||
},
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"": "src/"
|
||||
}
|
||||
}
|
||||
}
|
20
scrape.php
Normal file
20
scrape.php
Normal file
@ -0,0 +1,20 @@
|
||||
<?php
|
||||
|
||||
require __DIR__.'/vendor/autoload.php';
|
||||
|
||||
use Scraping\Scraper;
|
||||
|
||||
// Page where the crawl starts.
$start_url = 'https://este.trasparenza-valutazione-merito.it/web/amministrazionetrasparente/papca-ap/-/papca/igrid/1452834656/1452816457';

// One XPath expression per crawl level: $filters[0] is applied to the start
// page, $filters[1] to every page reached through the links matched by
// $filters[0], and so on.
$filters = [
    '//td[@class="actions"]/a[@title="Visualizza Allegato"]',
    '//table[@class="allegati"]/a[@title="Download versione non firmata"]'
];

// Scraper::scrape() is static and recurses through the remaining levels by
// itself, so a single call at level 0 walks the whole filter chain. Calling it
// once per filter would re-fetch the start page and apply the deeper-level
// filters to it, which is not the page they are written for.
Scraper::scrape('GET', $start_url, $filters, 0);
|
30
src/Scraping/Scraper.php
Normal file
30
src/Scraping/Scraper.php
Normal file
@ -0,0 +1,30 @@
|
||||
<?php
|
||||
|
||||
namespace Scraping;
|
||||
|
||||
use \Goutte\Client;
|
||||
|
||||
/**
 * Recursive link crawler built on the Goutte client.
 *
 * Starting from a URL, it follows the links selected by a chain of XPath
 * filters, one filter per depth level, printing each visited URL.
 */
class Scraper
{
    /**
     * Not written or read by scrape(); kept because it is part of the public
     * surface of the class.
     *
     * @var array
     */
    public $links = [];

    /**
     * Number of filter levels. Callers may assign it, but scrape() does not
     * read it: the depth is derived from count($filters). Declared explicitly
     * so that assigning it does not create a dynamic property (deprecated
     * since PHP 8.2).
     *
     * @var int
     */
    public $levels = 0;

    /**
     * Fetches $url, prints it, and follows every link matched by the XPath
     * filter of the current level, recursing one level deeper for each link.
     *
     * NOTE(review): links matched at the last level are neither followed nor
     * stored or printed; only the pages actually requested are reported.
     *
     * @param string   $method  HTTP method passed to the client (e.g. 'GET').
     * @param string   $url     Page to fetch at this level.
     * @param string[] $filters XPath expressions, indexed by level (0-based).
     * @param int      $level   Index into $filters for the page being fetched.
     *
     * @return void
     */
    public static function scrape($method, $url, $filters, $level)
    {
        // No filter for this level: nothing could be extracted from the page,
        // so skip the request instead of failing on an undefined offset.
        if (!isset($filters[$level])) {
            return;
        }

        $client = new Client();
        $crawler = $client->request($method, $url);

        // Trace line indented two spaces per level. str_pad() cannot be used
        // for this: its length argument is the total width, and the "[N]"
        // label is already wider than $level * 2 for the first levels.
        print str_repeat(' ', $level * 2).'['.$level.'] '.$url."\n";

        $next_level = $level + 1;
        $has_next_level = $next_level < count($filters);

        $crawler->filterXPath($filters[$level])->each(
            static function ($node) use ($method, $filters, $next_level, $has_next_level) {
                // Resolved on every match, as link() throws when the node is
                // not a link element.
                $new_url = $node->link()->getUri();

                if ($has_next_level) {
                    self::scrape($method, $new_url, $filters, $next_level);
                }
            }
        );
    }
}
|
Loading…
Reference in New Issue
Block a user