request('GET', $url); if ($this->debug) { print '['.$level.'] '.$url."\n"; } $scraped_obj[$url]['content-type'] = $this->getContentType($client); $scraped_obj[$url]['is-downloadable'] = $this->isDownloadable($client); $scraped_obj[$url]['filename'] = ( $scraped_obj[$url]['is-downloadable'] ? $this->getFilename($client) : '' ); $this->results = array_merge( $this->results, $scraped_obj ); $new_urls = $crawler->filterXPath($this->link_rules[$level])->each(function ($node) { return $node->link()->getUri(); }); $sub_urls = array_merge( $sub_urls, $new_urls ); } if ($level++ < count($this->link_rules) && !empty($sub_urls)) { $this->scrape($sub_urls, $level); } }

    /**
     * Tells whether the client's current response has an allowed content type.
     *
     * The first "content-type" response header is compared against every entry
     * of $this->allowedMimetypes; the response counts as downloadable as soon
     * as one entry occurs as a substring of the header value. The comparison
     * is case-insensitive.
     *
     * @param Client $client Client whose current response is inspected
     *
     * @return boolean True when an allowed mimetype matches, false otherwise
     *                 (including when the response has no content-type header)
     */
    public function isDownloadable(Client $client)
    {
        // Read the header once instead of once per allowed mimetype.
        $content_type = $this->getContentType($client);
        if ($content_type === '') {
            return false;
        }

        foreach ($this->allowedMimetypes as $mimetype) {
            $mimetype = (string) $mimetype;

            // An empty needle would warn or match everything depending on the
            // PHP version, so it is never treated as a match.
            if ($mimetype !== '' && stripos($content_type, $mimetype) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Get the filename of the http response based on the URL or the
     * content-disposition header.
     *
     * Without a content-disposition header the basename of the request URI is
     * returned. With one, the "filename" parameter (quoted or unquoted) is
     * used, followed by "-" and the $this->allowedMimetypes entry stored under
     * the response content type. If the header carries no usable filename, the
     * current Unix timestamp is returned as a string.
     *
     * @param Client $client Client whose current request/response is inspected
     *
     * @return string
     */
    public function getFilename(Client $client)
    {
        // Default: last path segment of the requested URL.
        $filename = basename($client->getRequest()->getUri());

        $headers = $client->getResponse()->getHeaders();
        if (!isset($headers['content-disposition'][0])) {
            return $filename;
        }

        // Try to get filename from content-disposition; the parameter may be a
        // quoted string (group 1) or a bare token (group 2).
        $content_disposition = $headers['content-disposition'][0];
        $pattern = '/filename\s*=\s*(?:"([^"]+)"|([^;\s"]+))/i';
        if (!preg_match($pattern, $content_disposition, $m)) {
            return (string) time();
        }

        // The header comes from the remote server: keep only the final path
        // component so a value such as "../../x" cannot point outside the
        // directory the caller writes to.
        $name = isset($m[2]) ? $m[2] : $m[1];
        $name = basename(str_replace('\\', '/', $name));
        if ($name === '' || $name === '.' || $name === '..') {
            return (string) time();
        }

        // Look the suffix up by the exact header value first, then by the bare
        // media type with parameters such as "; charset=..." removed, so a
        // parameterised header no longer triggers an undefined-index notice.
        $content_type = $this->getContentType($client);
        $parts = explode(';', $content_type, 2);
        $bare_type = strtolower(trim($parts[0]));

        $suffix = '';
        foreach ([$content_type, $bare_type] as $key) {
            if ($key !== '' && isset($this->allowedMimetypes[$key])) {
                $suffix = (string) $this->allowedMimetypes[$key];
                break;
            }
        }

        // NOTE(review): the "-" separator is kept even when no suffix is found
        // so the produced string matches what this method returned before;
        // confirm whether a trailing "-" is actually wanted.
        return $name.'-'.$suffix;
    }

    /**
     * Get the content type of the http response.
     *
     * @param Client $client Client whose current response is inspected
     *
     * @return string First "content-type" header value, or an empty string
     *                when the response has no such header
     */
    public static function getContentType(Client $client)
    {
        $headers = $client->getResponse()->getHeaders();

        return isset($headers['content-type'][0]) ? $headers['content-type'][0] : '';
    }
}