113 lines
3.1 KiB
PHP
113 lines
3.1 KiB
PHP
<?php
|
|
|
|
namespace Scraping;
|
|
|
|
use \Goutte\Client;
|
|
|
|
/**
 * Level-based web scraper built on top of Goutte.
 *
 * Starting from a list of URLs, each page is requested, its response headers
 * are recorded in {@see Scraper::$results}, and the links selected by the
 * XPath rule configured for the current level are followed on the next level.
 */
class Scraper
{
    /**
     * Collected data, keyed by URL. Each entry holds the keys 'content-type',
     * 'is-downloadable' and 'filename'. Scraping the same URL again overwrites
     * its entry.
     *
     * @var array
     */
    public $results = [];

    /**
     * Mimetype configuration.
     *
     * isDownloadable() treats every VALUE as a substring to look for in the
     * response content type; getFilename() looks up the entry whose KEY equals
     * the response content type and appends it to the filename.
     *
     * NOTE(review): these two usages only agree for a map such as
     * content type => suffix whose values also occur inside the content type;
     * confirm the intended shape with the callers.
     *
     * @var array
     */
    public $allowedMimetypes = [];

    /**
     * XPath expressions indexed by crawl level. The rule stored at index
     * $level selects the links to follow from pages scraped at that level.
     *
     * @var array
     */
    public $link_rules = [];

    /**
     * When true, every requested URL is printed together with its level.
     *
     * @var bool
     */
    public $debug = false;

    /**
     * Scrapes the specified URLs and recursively follows the links selected
     * by the link rule of the current level.
     *
     * Recursion continues with $level + 1 while $level is lower than the
     * number of configured link rules and at least one link was collected.
     *
     * @param array $urls  URLs to request
     * @param int   $level Current crawl level, used as index into $link_rules
     *
     * @return void
     */
    public function scrape(array $urls, int $level)
    {
        $client = new Client();

        // Tolerate the property having been reset to null by a caller.
        $rules = $this->link_rules ?? [];
        $sub_urls = [];

        foreach ($urls as $url) {
            $crawler = $client->request('GET', $url);

            if ($this->debug) {
                print '['.$level.'] '.$url."\n";
            }

            $content_type = static::getContentType($client);
            $is_downloadable = $this->isDownloadable($client);

            // Write the entry directly instead of accumulating a temporary
            // array and re-merging all previous entries on every iteration.
            $this->results[$url] = [
                'content-type' => $content_type,
                'is-downloadable' => $is_downloadable,
                'filename' => $is_downloadable ? $this->getFilename($client) : '',
            ];

            if (isset($rules[$level])) {
                $new_urls = $crawler->filterXPath($rules[$level])->each(static function ($node) {
                    return $node->link()->getUri();
                });

                $sub_urls = array_merge($sub_urls, $new_urls);
            }
        }

        // Results are keyed by URL, so requesting the same link twice on one
        // level would only repeat the HTTP request.
        $sub_urls = array_values(array_unique($sub_urls));

        if ($level < count($rules) && !empty($sub_urls)) {
            $this->scrape($sub_urls, $level + 1);
        }
    }

    /**
     * Returns whether the last response of the client is downloadable, i.e.
     * whether its content-type header contains one of the allowed mimetypes.
     *
     * @param Client $client Client whose last response is inspected
     *
     * @return boolean False when the response has no content-type header
     */
    public function isDownloadable(Client $client)
    {
        $content_type = self::firstHeader($client, 'content-type');

        if ($content_type === null) {
            return false;
        }

        foreach ($this->allowedMimetypes ?? [] as $mimetype) {
            $needle = (string) $mimetype;

            // An empty entry would match every response (or raise a warning,
            // depending on the PHP version), so it is ignored.
            if ($needle === '') {
                continue;
            }

            // strpos() !== false instead of the truthiness of strstr(), which
            // is falsy when the matched tail happens to be "0".
            if (strpos($content_type, $needle) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Get filename of http response based on URL or content-disposition header.
     *
     * Without a content-disposition header the basename of the request URI is
     * returned. With a header carrying a quoted filename, that filename is
     * returned, followed by '-' and the $allowedMimetypes entry keyed by the
     * response content type when such an entry exists. With a header that has
     * no quoted filename, the current Unix timestamp is returned.
     *
     * NOTE(review): the header-supplied name comes from the remote server and
     * is returned as-is; sanitise it before using it as a filesystem path.
     *
     * @param Client $client Client whose last request/response is inspected
     *
     * @return string
     */
    public function getFilename(Client $client)
    {
        $content_disposition = self::firstHeader($client, 'content-disposition');

        if ($content_disposition === null) {
            return basename($client->getRequest()->getUri());
        }

        if (!preg_match('/filename="([^"]+)"/', $content_disposition, $m)) {
            // Cast so the documented string return type actually holds.
            return (string) time();
        }

        $filename = $m[1];
        $mimetypes = $this->allowedMimetypes ?? [];
        $content_type = static::getContentType($client);

        // Only append the suffix when an entry exists for this content type;
        // an unchecked lookup raises an undefined-index notice/warning.
        if (isset($mimetypes[$content_type]) && is_scalar($mimetypes[$content_type])) {
            $filename .= '-'.$mimetypes[$content_type];
        }

        return $filename;
    }

    /**
     * Get content type of http response: the part of the content-type header
     * in front of the first ';', without surrounding whitespace.
     *
     * @param Client $client Client whose last response is inspected
     *
     * @return string Empty string when the response has no content-type header
     */
    public static function getContentType(Client $client)
    {
        $header = self::firstHeader($client, 'content-type');

        if ($header === null) {
            return '';
        }

        return trim(explode(';', $header)[0]);
    }

    /**
     * Returns the first value of a response header, matching the header name
     * case-insensitively (HTTP header names are case-insensitive).
     *
     * @param Client $client Client whose last response is inspected
     * @param string $name   Header name to look for
     *
     * @return string|null Null when the header is absent or has no value
     */
    private static function firstHeader(Client $client, string $name)
    {
        foreach ($client->getResponse()->getHeaders() as $key => $value) {
            if (strcasecmp((string) $key, $name) !== 0) {
                continue;
            }

            // Header values may be stored as a list of values or as a plain string.
            $first = is_array($value) ? ($value[0] ?? null) : $value;

            if ($first !== null) {
                return (string) $first;
            }
        }

        return null;
    }
}