Primo commit

This commit is contained in:
loviuz 2022-01-06 17:20:40 +01:00
parent f484a21347
commit 9db2ec22d9
3 changed files with 60 additions and 0 deletions

10
composer.json Normal file
View File

@ -0,0 +1,10 @@
{
"require": {
"fabpot/goutte": "^4.0"
},
"autoload": {
"psr-4": {
"": "src/"
}
}
}

20
scrape.php Normal file
View File

@ -0,0 +1,20 @@
<?php

declare(strict_types=1);

// Command-line entry point: crawls the transparency portal listing page and
// follows the attachment links selected by the XPath filter chain below.
require __DIR__.'/vendor/autoload.php';

use Scraping\Scraper;

// Listing page the crawl starts from.
$start_url = 'https://este.trasparenza-valutazione-merito.it/web/amministrazionetrasparente/papca-ap/-/papca/igrid/1452834656/1452816457';

// One XPath expression per crawl depth: $filters[0] is applied to the start
// page, $filters[1] to every page reached through the links matched by
// $filters[0], and so on.
$filters = [
    '//td[@class="actions"]/a[@title="Visualizza Allegato"]',
    '//table[@class="allegati"]/a[@title="Download versione non firmata"]',
];

// Scraper::scrape() descends through the remaining filter levels by itself,
// so a single call at level 0 covers the whole chain. Calling it once per
// filter would request the start page again and apply the deeper-level
// filters to it, which they were not written for.
Scraper::scrape('GET', $start_url, $filters, 0);

30
src/Scraping/Scraper.php Normal file
View File

@ -0,0 +1,30 @@
<?php
namespace Scraping;
use \Goutte\Client;
/**
 * Recursive link scraper driven by a chain of XPath filters, one filter per
 * crawl depth.
 */
class Scraper
{
    /**
     * @var array Public link store; nothing in this class writes to it.
     */
    public $links = [];

    /**
     * @var int Number of filter levels. Declared so that callers assigning it
     *          do not create a dynamic property; scrape() does not read it.
     */
    public $levels = 0;

    /**
     * Requests $url, prints an indented trace line for it, and follows every
     * link matched by the XPath filter of the current level.
     *
     * Recursion stops once the last filter has been applied: links matched at
     * the final level are resolved but not requested.
     *
     * @param string      $method  HTTP method used for every request (e.g. 'GET').
     * @param string      $url     Absolute URL of the page to fetch.
     * @param array       $filters XPath expressions indexed by depth level (0-based).
     * @param int         $level   Index into $filters for this page.
     * @param Client|null $client  Client to reuse across the crawl; a new one
     *                             is created when omitted.
     *
     * @return void
     */
    public static function scrape($method, $url, $filters, $level, $client = null)
    {
        // No filter for this level (empty chain or level out of range):
        // there is nothing to select, so do not even request the page.
        if (!isset($filters[$level])) {
            return;
        }

        // Reuse one client for the whole crawl instead of building a new one
        // for every page visited.
        if ($client === null) {
            $client = new Client();
        }

        $crawler = $client->request($method, $url);

        // Trace line, indented two spaces per depth level.
        print str_repeat('  ', $level).'['.$level.'] '.$url."\n";

        $crawler->filterXPath($filters[$level])->each(
            static function ($node) use ($method, $filters, $level, $client) {
                // link() resolves the href against the page it was found on.
                $new_url = $node->link()->getUri();
                $new_level = $level + 1;

                if ($new_level < count($filters)) {
                    self::scrape($method, $new_url, $filters, $new_level, $client);
                }
            }
        );
    }
}