structured_scraper/src/Scraping/Scraper.php

39 lines
1.2 KiB
PHP
Raw Normal View History

2022-01-06 17:20:40 +01:00
<?php
namespace Scraping;
use \Goutte\Client;
class Scraper
{
    public $links = [];

    /**
     * Recursively follows links starting at $url and saves every non-HTML
     * response into $download_dir.
     *
     * For an HTML response the XPath expression in $filters[$level] is applied
     * to the page and each matched node is followed at $level + 1, as long as
     * that next level is still smaller than count($filters). Any other
     * response is written to disk, named after the quoted filename in its
     * Content-Disposition header, or after the current Unix timestamp when no
     * such filename is present.
     *
     * @param string $method       HTTP method used for every request.
     * @param string $url          URL to fetch.
     * @param array  $filters      XPath expressions indexed by level; each is
     *                             expected to select link elements, because
     *                             link() is called on every matched node.
     * @param int    $level        Current depth; also used to indent the
     *                             progress line printed for HTML pages.
     * @param string $download_dir Directory that downloaded files are written to.
     */
    public static function scrape($method, $url, $filters, $level, $download_dir)
    {
        $client = new Client();
        $crawler = $client->request($method, $url);
        $headers = $client->getResponse()->getHeaders();

        // A response without a Content-Type header is treated as non-HTML
        // instead of raising an undefined-index notice. Media types are
        // case-insensitive, hence stripos().
        $content_type = $headers['content-type'][0] ?? '';

        if (stripos($content_type, 'text/html') !== false) {
            print str_pad('['.$level.']', $level*2, ' ', STR_PAD_LEFT).' '.$url."\n";

            // No filter configured for this depth: there is nothing to follow.
            if (!isset($filters[$level])) {
                return;
            }

            // $download_dir has to be captured explicitly; without it the
            // recursive call received an undefined variable and files ended up
            // under '/' instead of the requested directory.
            $crawler->filterXPath($filters[$level])->each(
                function ($node) use ($method, $filters, $level, $download_dir) {
                    $new_url = $node->link()->getUri();
                    $new_level = $level + 1;

                    // NOTE(review): with this bound, links matched by the last
                    // filter are never fetched - confirm that is intended.
                    if ($new_level < count($filters)) {
                        self::scrape($method, $new_url, $filters, $new_level, $download_dir);
                    }
                }
            );

            return;
        }

        $content_disposition = $headers['content-disposition'][0] ?? '';

        // Fallback name when the server does not supply a usable one.
        $filename = time();
        if (preg_match('/filename="([^"]+)"/', $content_disposition, $m)) {
            // The header is server-controlled: keep only the final path
            // component (for both separator styles) so that a value such as
            // "../../x" cannot escape $download_dir.
            $candidate = basename(str_replace('\\', '/', $m[1]));
            if ($candidate !== '' && $candidate !== '.' && $candidate !== '..') {
                $filename = $candidate;
            }
        }

        file_put_contents($download_dir.'/'.$filename, $client->getResponse()->getContent());
    }
}