structured_scraper/src/Scraping/Scraper.php

108 lines
3.0 KiB
PHP
Raw Normal View History

2022-01-06 17:20:40 +01:00
<?php
namespace Scraping;
use \Goutte\Client;
/**
 * Crawls pages level by level and records, for every URL requested, its
 * content type, whether it counts as downloadable and a filename for it.
 */
class Scraper
{
    /**
     * Scrape results keyed by URL. Every entry holds the keys
     * 'content-type', 'is-downloadable' and 'filename'.
     *
     * @var array
     */
    public $results = [];

    /**
     * Mimetype configuration, to be set by the caller before scraping.
     *
     * isDownloadable() substring-matches every value against the response
     * content type; getFilename() looks the content type up as a key and
     * appends the value found there to the filename.
     * NOTE(review): the two usages suggest a map of mimetype => suffix, but
     * that is not visible here - confirm against the caller.
     *
     * @var array
     */
    public $allowedMimetypes;

    /**
     * XPath expressions indexed by crawl level: the expression at index N
     * selects the links to follow from the pages scraped at level N.
     *
     * @var array
     */
    public $link_rules;

    /**
     * Scrapes the given URLs and then, recursively, the links found on them.
     *
     * Every URL is requested and described in $results. When $link_rules has
     * an entry for $level, the links it selects are collected from each page
     * and scraped with $level + 1. URLs at a level without a rule are still
     * recorded, but their links are not followed, which ends the recursion.
     *
     * @param array $urls  URLs to request at this level
     * @param int   $level Index into $link_rules used to find the next URLs
     */
    public function scrape(array $urls, int $level)
    {
        $client = new Client();
        // No rule for this level means these URLs are leaves of the crawl.
        $link_rule = $this->link_rules[$level] ?? null;
        $sub_urls = [];

        foreach ($urls as $url) {
            $crawler = $client->request('GET', $url);

            $content_type = static::getContentType($client);
            $is_downloadable = $this->isDownloadable($client);
            $this->results[$url] = [
                'content-type' => $content_type,
                'is-downloadable' => $is_downloadable,
                'filename' => $is_downloadable ? $this->getFilename($client) : '',
            ];

            if ($link_rule === null) {
                continue;
            }

            $new_urls = $crawler->filterXPath($link_rule)->each(function ($node) {
                return $node->link()->getUri();
            });
            $sub_urls = array_merge($sub_urls, $new_urls);
        }

        // Results are keyed by URL, so requesting the same URL twice within
        // a level would only repeat the HTTP call.
        $sub_urls = array_values(array_unique($sub_urls));

        if (!empty($sub_urls)) {
            $this->scrape($sub_urls, $level + 1);
        }
    }

    /**
     * Returns if URL is downloadable
     *
     * The last response of $client is downloadable when its content-type
     * header contains one of the values of $allowedMimetypes. Empty and
     * non-string entries are ignored so that they cannot match everything.
     *
     * @return boolean
     *
     * @param Client $client Client whose last response is inspected
     */
    public function isDownloadable(Client $client)
    {
        $content_type = static::getContentType($client);
        if ($content_type === '') {
            return false;
        }

        foreach ($this->allowedMimetypes ?? [] as $mimetype) {
            if (!is_string($mimetype) || $mimetype === '') {
                continue;
            }
            if (strpos($content_type, $mimetype) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Get filename of http response based on URL or content-disposition header
     *
     * Without a content-disposition header the basename of the request URI
     * is returned. With a header carrying a quoted filename, that name is
     * returned followed by '-' and the $allowedMimetypes entry stored under
     * the response content type (empty when there is none). With a header
     * that has no quoted filename, the current Unix timestamp is returned.
     *
     * @return string
     *
     * @param Client $client Client whose last request/response is inspected
     */
    public function getFilename(Client $client)
    {
        $content_disposition = self::firstHeaderValue($client, 'content-disposition');
        if ($content_disposition === null) {
            return basename($client->getRequest()->getUri());
        }

        // Try to get filename from content-disposition
        if (!preg_match('/filename="([^"]+)"/', $content_disposition, $m)) {
            return (string) time();
        }

        $suffix = $this->allowedMimetypes[static::getContentType($client)] ?? '';

        // The header comes from the remote server: keep only the last path
        // component so the name cannot point outside a target directory.
        return basename($m[1]) . '-' . $suffix;
    }

    /**
     * Get content type of http response
     *
     * @return string First content-type header value, or '' when absent
     *
     * @param Client $client Client whose last response is inspected
     */
    public static function getContentType(Client $client)
    {
        return self::firstHeaderValue($client, 'content-type') ?? '';
    }

    /**
     * Reads the first value of a header from the client's last response.
     *
     * @return string|null The value, or null when the header is not set
     *
     * @param Client $client Client whose last response is inspected
     * @param string $name   Header name exactly as keyed by getHeaders()
     */
    private static function firstHeaderValue(Client $client, string $name)
    {
        $headers = $client->getResponse()->getHeaders();

        return $headers[$name][0] ?? null;
    }
}