From 9db2ec22d90078b52182fcd36a1bc7239b1dd93c Mon Sep 17 00:00:00 2001 From: loviuz Date: Thu, 6 Jan 2022 17:20:40 +0100 Subject: [PATCH] Primo commit --- composer.json | 10 ++++++++++ scrape.php | 20 ++++++++++++++++++++ src/Scraping/Scraper.php | 30 ++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+) create mode 100644 composer.json create mode 100644 scrape.php create mode 100644 src/Scraping/Scraper.php diff --git a/composer.json b/composer.json new file mode 100644 index 0000000..2b67faf --- /dev/null +++ b/composer.json @@ -0,0 +1,10 @@ +{ + "require": { + "fabpot/goutte": "^4.0" + }, + "autoload": { + "psr-4": { + "": "src/" + } + } +} \ No newline at end of file diff --git a/scrape.php b/scrape.php new file mode 100644 index 0000000..e90cb20 --- /dev/null +++ b/scrape.php @@ -0,0 +1,20 @@ +levels = count($filters); + +foreach ($filters as $i => $filter) { + $scraper->scrape('GET', $start_url, $filters, $i); +} diff --git a/src/Scraping/Scraper.php b/src/Scraping/Scraper.php new file mode 100644 index 0000000..73b098d --- /dev/null +++ b/src/Scraping/Scraper.php @@ -0,0 +1,30 @@ +request($method, $url); + + print str_pad('['.$level.']', $level*2, ' ', STR_PAD_LEFT).' '.$url."\n"; + + // 1) Lista atti + $crawler->filterXPath($filters[$level])->each(function ($node) use ($method, $filters, $level) { + $new_url = $node->link()->getUri(); + $new_level = $level+1; + + //print str_pad('['.$new_level.']', $new_level*2, ' ', STR_PAD_RIGHT).' '.$new_url."\n"; + + if ($new_level < count($filters)) { + self::scrape($method, $new_url, $filters, $new_level); + } + }); + } +} \ No newline at end of file