From 2ebea9e816c11c168d4e6b0741ed1980fc149cbe Mon Sep 17 00:00:00 2001 From: loviuz Date: Sat, 8 Jan 2022 20:30:33 +0100 Subject: [PATCH] Completamento scraping con dowload --- config.example.php | 18 ++++- example/pagina1/sub1.1/index.html | 3 +- example/pagina1/sub1.1/test.pdf | Bin 0 -> 25626 bytes scrape.php | 26 ++++++- src/Scraping/Scraper.php | 116 ++++++++++++++++++++++-------- 5 files changed, 129 insertions(+), 34 deletions(-) create mode 100644 example/pagina1/sub1.1/test.pdf diff --git a/config.example.php b/config.example.php index c74fbd3..bfb9949 100644 --- a/config.example.php +++ b/config.example.php @@ -1,10 +1,24 @@ estensione con cui salvare +// i file +$allowedMimetypes = [ + 'application/pdf' => 'pdf' +]; + +// Directory dove salvare i file trovati $download_dir = __DIR__.'/pdf'; diff --git a/example/pagina1/sub1.1/index.html b/example/pagina1/sub1.1/index.html index 990f126..fad5e0f 100644 --- a/example/pagina1/sub1.1/index.html +++ b/example/pagina1/sub1.1/index.html @@ -1,2 +1,3 @@ Subsublink 1
-Subsublink 2
\ No newline at end of file +Subsublink 2
+Document
\ No newline at end of file diff --git a/example/pagina1/sub1.1/test.pdf b/example/pagina1/sub1.1/test.pdf new file mode 100644 index 0000000000000000000000000000000000000000..64a87539b0af606d3354e33443e4b826723539a6 GIT binary patch literal 25626 zcmeHwd05lO);|^jiB()KVnra;Dq2j!V%QUE5fBxlq98^#Lx4aQ2nk`QLY2BfTSZz> zpi)IygFx89xIo1MDqGl+0jz4!Uu`^Wn{?{n{)`~lz0_sp3yXFi`Z z^PMDTb{#)x4by|~hwQ2ujk^okwabbFM ze@hAp+#sAM5DB#4qxe8P5d{=LVg4&}$1ukaoVK?Dl3yPQ0O$21-Qh}X?}E?HGznizWsMvvVb6=x=_k_ABed*;wYZ%Me~Llf|8R6 zpoNB~LJ`(PcN+d69!K%OqtTE68WrzOf&|@09W&YVMEBFtX-K%-FUUtbMPYK=G#`I| z>Vp5GD|Dk9>s4WlV&*r<@oZ7N;@+|#hCxn6wcBy7W>~NGh0PtCZp*K}(jL|HnSAG>oZ*bk35?(T(0*x@}0?q5@apjcgS8)<@s8t>P4hJc$?JedYH zkaie?IEfFS(5X0l0C2@|Dg|ecr(qFbOrZ!{63}7^DxATb_rc?6P~)W_tI709UfMOw1Lr+4Sp+Ey2*r4~Ps(K%)uGF-2 zr@0d;Uhh?`*lUNUxqE;+P?NtjEwygBX~an*h>`T}N4unNS8m$V&(ownTI!y(D?|#_ z-p?I}M;ycl5^#7Ms(Ua5;YjcRpwQoMtP4RL!V|o_fy0ox#t_8U1X{pxJXMNe3R&t$ zP-Y9fLG`0ZF$1Mh>3GNrj{XcI2v9GD6YxH1+1Frra5M@Ps)t;%f=(g_EH#KmzsID$ zzR7-FH2MSlMqBh&$;qvG4CQN`^&S1@Z=>tvy;KE;aA#;KZ2{IPQ zSRiA8j0G|l$XFm_fs6$*7RXp2V}XnXG8V{KAY*}y1u_=MSRiA8j0G|l$XFm_fs6$* z7W{9spn-emF$vu|r&TpN@>8DO+D#fwgU9Pn9Da0rx6Lk-v-Je}nqBm-?>N}*J$%U4 z>Qup+dvC66hpoQ@*_*auTD#*@&rkDpUq)6tjPVM&t)kLg(L&NdzD>xCi)eZ-tV}R6 zUhER*JG<~i*--8qzo>O`(A#eEa%;3Xa&rI4Mc=r<%12^MKgR;)wYRH?ODY26vsV_Brx2 zKWHpm)MZBDc9sP6cQP7K0b?l^kz?hdUc45LUl+3VjMoBvcTSgt@ znJvnO-iW(8mn2^VH!^qzp|mtsWLE&oTPXB?hG2PV&Ab~Flno^Xc2aLrz8B8Qal)zw z;sU8d*{75Z2d}tA3wM;B=U{$}G@p!KINH$vecFIU926Sv|LetOr2nlrwa^HKMZEtNf>@+Fcv5$jzItk@;l zG%)&l+Jv;$LN3rGq~iq7k;(R3vpf`&U(7yM7VI;8FKi{*L{#sI2~#S|JIgI&v-e3y z$zcYe*EhQN>DIYaN=3nQtEcUoiMrdP*0o<9yy9+d>mu%qn#>S<=P4BxBlKoNox>A} z8`v$zJsN4VV;4U5uiJJaRwIWLFmjc_Ax2-`+;RqgQT3Kciz=N)1C?!}`Dt;A zGk1O??ki35u1-j3Nb%k<<8|f7?EaTy)5qaA@IVk$cn{QFRtGC z!WYZEM9v-=oxG-$yl&!q)PKxQoESCn7Jv}5wL=pnVc>Qv+&K(hQ=*PcIAa$on?sPX6{>k`5O13N4$ zXRg(^Zpps3@OYz!z+jFw0-rk{lX-jcJ#`T8;-u)Y!R&C9%4Y=8*h_ zMNrU%G?rdyS-+YwM?$i{H8QqlEhSlKQDZ)Vu;XszH@ijTq15YK&yS@Bw=ON246Tb%w`+bGH>K97P1H}cWh&2M@}8qlc{#=)9MCURS(@#5PpMdjjx(~3rc4@Y9>&b{mq zTKtmm^3__yt((}2R?Ry&TgDcj4)>I;cO`vqVD|6;6S>8uoN*PgDy$DFzrCy}YVG>c zUz$lZnw@z;<kuXTQIM!?IhQ2QY6 zyfF2HP8BD@ud(!Wrkn4%@&I~vml5(RT5hasVAS(vB)2#-6rck{VXI~+C!(b5xA>7z zxs4j{xtZ{73N!EXt z7iT>7YWG#`^6aEC$Okj(Lx+Bg58eGK`D~HT7UK1as&h`Ie<(z2%`wXur5UBh-3=o& zf5(ryi$vQ?sIBeMNiEr>3VllBGD;eRqj2Y%f)Q(EFd?- zIQLKID$w7~DOw0Ou3c|_ue5lBWT z>RKrb4=|nc>VAvIseHC<#5SeKM~|i+UjaRQa37IZXp;Y%T+t`A1Iu0>(Qypn34e@N zG|D+rxR!+}&wsQ(w2mFOki>}0Di!8ym~!owrn@~EifqGhDt_-DmP8=f;E>Daio+Pu zL2~gwbZaD}ONvdqnw+TS@evMyqj_jBQizI#p81M3HAh?T(u1Vbc-O_WdSAmkm&+~y8X zmnvrG?$P^6>G5m9!+m@zdqQLEixs-WxRv3fhuZ@O3d5$oh)rV1HhMg5qEx66yh8wVK0 zH&(`4BPx5muywyv*jx-Yc^7dX&>w3~d|Ugi)5~=c#2D26V_`>P=9X71(nw0~ENB}} zLbr_HRFOYb=GHOya;M|{WD7-{R_~KxWR1&w#d^`k6?9lJk=!7k{AQ!=trO<%(T+vH zE37i3l>IY#ar8;+=E4?U*w#UZZ`MnrLY&)N@T7{q9coBR(Uf%u08b+>b6|adE|JR22ULs)|PopZM8Kg+^AVuVw94yCfD@sYHyfNiliwXezp^ooBUz z4^4K<%+N6y*_a@>#q-#n1$4({IMc>sqqWSwzM+z*ArT50?~S%)!oskb4fg!$*L6x} zr5Azl9Qv$NX<$A(PNO>xlO1bXvEaNi=+bDdUX?fjvw=~`qDzh%ygcVO;kEs~C9Gz@&vMwt7Mfw7UOL>RWiM9CkQB$|?-S^Xv zGOPM`6oMSy`)#iz`{_{W;Y>$nSB3wy#_JNA`S3jegzu%G{zD=-wV#`Y$970qw$ z*p#HXa96LQ*t;Ss+g2OtnPv7a@3f>W96yvKX%W5;^GFEG6Vo3_DH93wFQv~sIL)lP z6Jm?!r(#sXh?`qaS50H8aV0)*GXfqd_xrI-+R@6GAyOj}M?xq-GTq&<`Nlr7*%&g%R!fPzi zh827{GGXHO5z~d=)(g0V={(Ni>rtd`v$$M%#31uP9BHhN2ZC?kFpIHLVOyG;dd0&5 z(Nr(#SdH*I60hjUh(YBRdoW`OFM0RY4qtAAfNHOLzdSPOs9EpuO`sH*0JY~jpZ9T% zEJj^BDpDK|hDY(=3>Z3f&NcNZnX@k97Rw&(59sk@eNrHuJljDx7?hldw7m0AUWXEO zGx)S@Lv~AL~?#Tj&NuYy*aPdM&HkSqsHV(lYB;RTotzjWzG$ z+~)_AZZsEZ@PrDiRSs`Tn)7qII$0HK(OcBlVZM5Wl+IkboNa6CeT2VcX6%&U8k6I? z3KLg%^o4eI5@#>jDVnhgCmDQygZFdvr!4E<#HS5o!vnDq3!AO|r*fKha5Rm}g2k+| zYu*>~`11|$_?OiP2%tIC~bdgtQ|DgeNU@2HB)O|BtnJX+}Zw_fNeP0^EB7TztL;`_D)W0 zLyRD_)_h>r;RBq~SQa8E`hyT$wW1@7l|7J9D!zoSR4g&fl9HRWuP%-U#Wo%>{X z<6PMNNd+e*HKHTY-f0u@`$NFz^!30S^@8$>=Vc*rg>}juh}N2)JgX^8Sn*J+t3}at zojrWC=jyiTwJcXu<|Eql&1BXamG_P}<7~VB84vwyrpMenEF^N)?VMsq&(B4?pxM^@ zg=T+zdo>f`KW)ZRv&$*-<9nh=_q39~N%fV6=TNYSY>cFN%k=Rzz*s7 zTL3dc5wpJsJI`~}T=P%z(iV3kIbsml($um0IB;=7?HfCF;YVNjIIUq>w_cNMOGu95!K4wZsL@J??q#^cz+rW{7%lTnU}L#!!z-(ozW5%{3Yb@XT+q)dm+cJ z!)vyzWeLVkI3-rPV6a0m+R(v7Fb7ue;aowvin^0iv=}F1wA!XRBxiiiu8StcVNir7 z+*Op^u}Dk&(3tI?TypkORQBs)iR2hMZ2@z%IJL=9Wmt_US>Rqy)54CfdiNq)=g6eP z5J6|*-RjlOn~BGp^i_%1tMBr{yPpXy{9o7OcY_6up*f*Qb#AM)0yoV%hsYpMKALqa z&kZ|qb8fB?-6m8jHKSPBAlI@iRro!$5>oY+P`UETiETqk?XLR`XY$q*N^y^eb8dan zflQH{N=njpoiEsA4}w=mSlB?{oH}TON$-QCYlSy_zruOEu2eieMSOAuTO{UMD|j~T z;oJ|Kx#%l2V>QXUjSa`&vlo2^A_@yIr0}RBHZAX6g3Thc?|NBso6t9=>!BT!B>q7y zyII0Hsqzih@WHc8EW*Msu&$%7sW5k?Z^?GQ~yK$ zkBn@cBNiR*#@JQuo233x2mZ-UnLD1DWLoascpa=ZE(sD#XQ}w=TOo4ij0>f3mIfc` z{PNNdJU+P2-*-s3SQVHbp@JTZCKWl7^V@u$OQrXlP{yv&WAar~VE}Qkmw;=Xi+R0C zJZ8<$nZn>fe+^<;h2P6`vTea1}5BRdSem1V( zD6BU=Tr>E#V#nLAl*$*o)w7+$h`)fPmqsL-I`j(jsh7@Ip@1f5tD^xXrsC~#l`Xn4 zBO*p;liQp?9_*^MjD~u4A#uX+e6KFk&=->Cx|TZy)rlh*7q2(ZY)5jsB(1Q-SlEE=J6nzqqJ+=SDy-`36hcT;+zru)fccPZXc&3qRdtR8Fz zJp?o8lzctuk!g3eFa<9-$Xs`Rlp%VsZ^8Yl)MFUnF)3UpALl1#34?u9*oNQb*;Ew0 zp8&Kuse9P0XI&JZlSIVgF;}gDO+@*vW!~wvjP)+qlCAR`qQc-;QYEB^SkcUB`nWiK zD&P>?@S8j%)Wb(jLuN|7W58?R>0u9Bn5x5im~&_PN<{1)j-sw`-dX2Jm3RQyv8;ms zxpbvdAW##Vq`Ij;=BR`fgVlhkQtx=)nrY~ghVRQHvzKbyD}1N?zhXN?jCN}C^r(c~ zfJv|=`c%9XvG+BXV_t$-%6fF`>*nt5(F zl5DbwlS%0H+ML626@2C9La?X?%DtAAw@YsRptDw7Y#i^VgAyK*I5`dXXuKYuJ@f6h z4NK-V)$ov1{9>+=K?t0k z+UFS*eY$oDs3_~yQo5C+WQcYD~zMbJR;?_N})P|XG z%IH+<;>p?b0Lvx%sP)tC$)LxC1swB=RuZMCCaQASQz{H&V;d_U)7AS#y-p|8jT31`#&jI7sFId zmC}0?QKf7?#X6^a_SXmiE6DYVk?UYVwNUGO2}md{OE`rc-_=k#UK_D~52yB8#oS|q zq^z;P`e9s^YO30@NXf*{U=4w)NRfxIcHq3`TU!%hjJfO4KZ1P>uxlh`P)V*S5Tg$HxL~jSwPLvB|cvsVN{kpVhxyd-8odH z%5D*GIxO1XMhTAG;k|uwH@XY*%4`X5yDC_%r<O4bM*t)dV{G3xM`7vgZQHw3& z-7Tsw71U5F(}vbLlDh==S9c6nb3tEw@#VGt%es*-uI0N@tI2r9^acblX~~# zCdP}?kZp4UX4~88-36sJ6-6$L7wOpM_DznMAp*rtnN`CPs*YgJ(BtDBTqaWD9nKM+ zD`merQ|#TnFxk&#h^9KgMpKuLuSzg~d?cO&KB&->ZtIL|5hqhRdcy0f*PEg7U6(iS z&MK*e=;Sb;K(4C6&ggp8hZ~T~h8|H^Da|6Ig9)36_cRvk>vNL~KVNTqUP zA-U~JJi2Lj2eostn8+3 z6R=q`>%4AmpX^GQqPdS7uEy%cl;dtDXoP;h?7F5P6_$GPNx$EkHlIx~|J5-n`}>D6 z7U{WaA4~{kh6f9%2Hd%N+rIU?IafIEiW9P0cpF!qimQW}U6OLw_(UeZCfW7v$S*3J zi3&&2o}$7z6o5DQtmJr;u?p*XmE?^-UNKDA?Ccs{pc9o%vFGMvFXYOt!1Vg08@A_? zmS)#%hV`B8E=ZsMI-A0}| zP=XgdFLG1g?;~0=Vg4dYWXKzhG_#4Oa7Nht;8@!&H}yj2#f6mgEdAnau+OnsN+_@i zpf@1SjF>5OQ7Ic@n^8{m*DCH*W}Rpw$9b8P(!L&mlk>nPG45;TDp*(eQ)Jcr#3(s3 zr0OAt=h1wC?A1oEt84V?G{K2sj`W2Z@BM+1W1i}ysp7kNil4^0$+ss)&Sgsk4UG}6 z7T3K&v3NbU7T2xQ|Z8^`~yFUH6aEkm>n(fPMr0Y0 zWkmij`qHi}q{%{>ETqXonk=OK$Nc8++8dM){0@s7-8SI04Qr$^-28HSi z)q@*q1C>+{Pt%hiSNO6*@D8QzO{4jlA`lD)1I{pjQ>b1DJrffXgswh9Umpe}U;)8o zntKq89Iz9#w`2(6%!g{^0Qizka2$n%Ae=$y!F3Vu8#)NOLnZi0Gs{Dz4ZEM0W`N(Z zV?_%{4*$>9fEz%M0q7pMG6gx#J?J<*6{T%+{3z7Mn-V}!3tXV92h%k^rDtNQr)R2bsG|%1>Y5||A_)cy zRJ8z;ELOlhB5d9DRL>d@C_xJ_qFXhtTyWIGahLsBD2&sGj)c=3e zBkq27cK?5$kVsOHN##rrQgCz<$e(U|5T#A06Ff}u$n(Y?#>RMyDdiD#FJ0EvK;5g#auhCu&^UjK87KE@>w>`>YaY1H|Lu&4hq?7e5& zo__$~64Ca8toKrY13F6~zUQE+lpnf!y2g4YdU{LL(AL$}24a9h|3YE~z5m#f6zbVrcW{!(UO45om;l<64*|NN!Qzz7`A@Ru?}BXF9*U(5FYRiD0rvB978(Wvg= z3=Aqb(*laHCxqZZ-=K(-6bd+IL3$VhIM9LYNr6fS2ps!hdk~5>x3o02{Mx`6X@xv^ z&=_fKq-%A+;DEKRiOE3|11l4x3Ho14I7?4(2%x!BX-f{(N9r5tgF`^9FxHU&2MY{k AQUCw| literal 0 HcmV?d00001 diff --git a/scrape.php b/scrape.php index 963f8b5..cfc561c 100644 --- a/scrape.php +++ b/scrape.php @@ -1,10 +1,32 @@ \n"); +} else { + require __DIR__.'/'.$argv[1]; +} + +$client = new Client(); $scraper = new Scraper(); -$scraper->scrape('GET', $start_urls, $filters, 0, $download_dir); +// Configure object +$scraper->allowedMimetypes = $allowedMimetypes; +$scraper->link_rules = $link_rules; + +$scraper->scrape($start_urls, 0); + +// Download documents +foreach ($scraper->results as $url => $scraped_obj) { + if ($scraped_obj['is-downloadable']) { + $client->request('GET', $url); + + print '[*] Downloading '.$scraped_obj['filename']."\n"; + file_put_contents( $download_dir.'/'.$scraped_obj['filename'], $client->getResponse()->getContent() ); + } +} \ No newline at end of file diff --git a/src/Scraping/Scraper.php b/src/Scraping/Scraper.php index 2aefcdd..37874dc 100644 --- a/src/Scraping/Scraper.php +++ b/src/Scraping/Scraper.php @@ -6,45 +6,103 @@ use \Goutte\Client; class Scraper { - public $links = []; + public $results = []; + public $allowedMimetypes; + public $link_rules; - public static function scrape($method, $urls, $filters, $level, $download_dir){ + /** + * Scrapes specified URLs + * + * @param array $urls + * @param int $level + */ + public function scrape(array $urls, int $level){ $client = new Client(); $sub_urls = []; foreach ($urls as $url) { - $crawler = $client->request($method, $url); + $crawler = $client->request('GET', $url); - //if (strstr($client->getResponse()->getHeaders()['content-type'][0], 'text/html')) { - print '['.$level.'] '.$url."\n"; - - $new_urls = $crawler->filterXPath($filters[$level])->each(function ($node) use ($sub_urls) { - return $node->link()->getUri(); - }); - - $sub_urls = array_merge( $sub_urls, $new_urls ); - /* - } else { - $content_disposition = $client->getResponse()->getHeaders()['content-disposition'][0]; - - $filename = time(); - - if (preg_match('/filename="([^"]+)"/', $content_disposition, $m)) { - $filename = $m[1]; - } - - file_put_contents( $download_dir.'/'.$filename, $client->getResponse()->getContent() ); + if( $url == 'http://localhost/example/pagina1/sub1.1/test.pdf' ){ + $a = 1; } - */ + + $scraped_obj[$url]['content-type'] = $this->getContentType($client); + $scraped_obj[$url]['is-downloadable'] = $this->isDownloadable($client); + $scraped_obj[$url]['filename'] = ( $scraped_obj[$url]['is-downloadable'] ? $this->getFilename($client) : '' ); + + $this->results = array_merge( $this->results, $scraped_obj ); + + $new_urls = $crawler->filterXPath($this->link_rules[$level])->each(function ($node) { + return $node->link()->getUri(); + }); + + $sub_urls = array_merge( $sub_urls, $new_urls ); } - if($level==3){ - $a=1; - } - - if ($level++ < count($filters) && !empty($sub_urls)) { - self::scrape($method, $sub_urls, $filters, $level, $download_dir); + if ($level++ < count($this->link_rules) && !empty($sub_urls)) { + $this->scrape($sub_urls, $level); } } + + /** + * Returns if URL is downloadable + * + * @return boolean + * + * @param Client $client + */ + public function isDownloadable(Client $client){ + foreach ($this->allowedMimetypes as $mimetype) { + if (isset($client->getResponse()->getHeaders()['content-type'][0])) { + if (strstr($client->getResponse()->getHeaders()['content-type'][0], $mimetype)) { + return true; + } + } + } + + return false; + } + + /** + * Get filename of http response based on URL or content-disposition header + * + * @return string + * + * @param Client $client + */ + public function getFilename(Client $client){ + $filename = basename( $client->getRequest()->getUri() ); + + // Try to get filename from content-disposition + if (isset($client->getResponse()->getHeaders()['content-disposition'][0])) { + $content_disposition = $client->getResponse()->getHeaders()['content-disposition'][0]; + + if (preg_match('/filename="([^"]+)"/', $content_disposition, $m)) { + $filename = $m[1].'-'.$this->allowedMimetypes[ $this->getContentType($client) ]; + } else { + $filename = time(); + } + } + + return $filename; + } + + /** + * Get content type of http response + * + * @return string + * + * @param Client $client + */ + public static function getContentType(Client $client){ + $content_type = ''; + + if (isset($client->getResponse()->getHeaders()['content-type'][0])) { + $content_type = $client->getResponse()->getHeaders()['content-type'][0]; + } + + return $content_type; + } } \ No newline at end of file