structured_scraper/src/Scraping/Scraper.php

50 lines
1.4 KiB
PHP

<?php
namespace Scraping;
use \Goutte\Client;
/**
 * Level-by-level link scraper built on the Goutte client.
 *
 * Each level requests a list of URLs, extracts links from every response
 * with the XPath expression registered for that level, and then recurses
 * into the collected links with the level incremented by one.
 */
class Scraper
{
    /**
     * @var array Not read or written by any method of this class.
     */
    public $links = [];

    /**
     * Request every URL of the current level, collect the links matched by
     * that level's XPath filter, and recurse into them.
     *
     * A progress line "[<level>] <url>" is printed for every requested URL.
     * Recursion continues while the current level (before incrementing) is
     * lower than count($filters) and at least one link was collected.
     *
     * @param string $method       HTTP method handed to Client::request().
     * @param array  $urls         URLs to request at this level.
     * @param array  $filters      XPath expressions indexed by level; each is
     *                             expected to select link elements, because
     *                             link() is called on every match.
     * @param int    $level        Current level; used as the index into $filters.
     * @param string $download_dir Not used at present (download handling is
     *                             disabled); only passed on to recursive calls.
     *
     * @return void
     */
    public static function scrape($method, $urls, $filters, $level, $download_dir)
    {
        $client = new Client();
        $sub_urls = [];

        foreach ($urls as $url) {
            $crawler = $client->request($method, $url);
            print '['.$level.'] '.$url."\n";

            // TODO: non-HTML responses (file downloads) are not handled yet;
            // every response is treated as an HTML page.

            // The recursion guard below lets the crawl reach one level past
            // the last filter of a 0-based list. Such a level is still
            // requested and printed, but there is nothing to extract, so skip
            // the XPath step instead of passing an undefined filter to it.
            if (!isset($filters[$level])) {
                continue;
            }

            $new_urls = $crawler->filterXPath($filters[$level])->each(static function ($node) {
                return $node->link()->getUri();
            });
            $sub_urls = array_merge($sub_urls, $new_urls);
        }

        // The comparison uses the pre-increment value of $level; the
        // incremented value is what the recursive call receives.
        if ($level++ < count($filters) && !empty($sub_urls)) {
            self::scrape($method, $sub_urls, $filters, $level, $download_dir);
        }
    }
}