mirror of
				https://github.com/johnwhitington/cpdf-source.git
				synced 2025-06-05 22:09:39 +02:00 
			
		
		
		
	JBIG2 extraction done
This commit is contained in:
		
							
								
								
									
										1
									
								
								Changes
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								Changes
									
									
									
									
									
								
							| @@ -17,6 +17,7 @@ Extended features: | |||||||
|  |  | ||||||
| o -list-images-used[-json] extends -image-resolution | o -list-images-used[-json] extends -image-resolution | ||||||
| o Use -raw with -extract-images to get PNMs | o Use -raw with -extract-images to get PNMs | ||||||
|  | o -extract-images can extract JBIG2 images and their globals | ||||||
|  |  | ||||||
| Fixes: | Fixes: | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										34
									
								
								cpdfimage.ml
									
									
									
									
									
								
							
							
						
						
									
										34
									
								
								cpdfimage.ml
									
									
									
									
									
								
							| @@ -27,11 +27,41 @@ let write_stream name stream = | |||||||
|     Pdfio.bytes_to_output_channel fh stream; |     Pdfio.bytes_to_output_channel fh stream; | ||||||
|     close_out fh |     close_out fh | ||||||
|  |  | ||||||
|  | let jbig2_serial = ref 0 | ||||||
|  |  | ||||||
|  | let jbig2_globals = null_hash () | ||||||
|  |  | ||||||
| let write_image ~raw ?path_to_p2p ?path_to_im pdf resources name image = | let write_image ~raw ?path_to_p2p ?path_to_im pdf resources name image = | ||||||
|   match Pdfimage.get_image_24bpp pdf resources image with |   match Pdfimage.get_image_24bpp pdf resources image with | ||||||
|   | Pdfimage.JPEG (stream, _) -> write_stream (name ^ ".jpg") stream |   | Pdfimage.JPEG (stream, _) -> write_stream (name ^ ".jpg") stream | ||||||
|   | Pdfimage.JPEG2000 (stream, _) -> write_stream (name ^ ".jpx") stream |   | Pdfimage.JPEG2000 (stream, _) -> write_stream (name ^ ".jpx") stream | ||||||
|   | Pdfimage.JBIG2 (stream, _) -> write_stream (name ^ ".jbig2") stream |   | Pdfimage.JBIG2 (stream, _, global) -> | ||||||
|  |       begin match global with | ||||||
|  |       | None -> | ||||||
|  |           Printf.printf "JBIG2: No global, writing plain\n"; | ||||||
|  |           write_stream (name ^ ".jbig2") stream | ||||||
|  |       | Some g -> | ||||||
|  |           Printf.printf "JBIG2: there is a global\n"; | ||||||
|  |           let go () = | ||||||
|  |             let serial, _ = Hashtbl.find jbig2_globals g in | ||||||
|  |               write_stream (name ^ ".jbig2__" ^ string_of_int serial) stream | ||||||
|  |           in | ||||||
|  |             try go () with Not_found -> | ||||||
|  |               jbig2_serial += 1; | ||||||
|  |               let globaldata = | ||||||
|  |                 let obj = Pdf.lookup_obj pdf g in | ||||||
|  |                   Pdfcodec.decode_pdfstream_until_unknown pdf obj; | ||||||
|  |                   match obj with | Pdf.Stream {contents = (_, Got b)} -> Some b | _ -> None | ||||||
|  |               in | ||||||
|  |                 match globaldata with | ||||||
|  |                 | Some d -> | ||||||
|  |                     Hashtbl.add jbig2_globals g (!jbig2_serial, d); | ||||||
|  |                     let filename = Filename.concat (Filename.dirname name) (string_of_int !jbig2_serial ^ ".jbig2global") in | ||||||
|  |                       write_stream filename d; | ||||||
|  |                       go () | ||||||
|  |                 | None -> | ||||||
|  |                     Pdfe.log "Could not extract JBIG2Globals. Skipping this image." | ||||||
|  |       end | ||||||
|   | Pdfimage.Raw (w, h, Pdfimage.BPP24, stream) -> |   | Pdfimage.Raw (w, h, Pdfimage.BPP24, stream) -> | ||||||
|       let pnm = name ^ ".pnm" in |       let pnm = name ^ ".pnm" in | ||||||
|       let png = name ^ ".png" in |       let png = name ^ ".png" in | ||||||
| @@ -99,6 +129,8 @@ let rec extract_images_form_xobject ~raw ?path_to_p2p ?path_to_im encoding dedup | |||||||
|       extract_images_inner ~raw ?path_to_p2p ?path_to_im encoding serial pdf resources stem pnum images |       extract_images_inner ~raw ?path_to_p2p ?path_to_im encoding serial pdf resources stem pnum images | ||||||
|  |  | ||||||
| let extract_images ?(raw=false) ?path_to_p2p ?path_to_im encoding dedup dedup_per_page pdf range stem = | let extract_images ?(raw=false) ?path_to_p2p ?path_to_im encoding dedup dedup_per_page pdf range stem = | ||||||
|  |   Hashtbl.clear jbig2_globals; | ||||||
|  |   jbig2_serial := 0; | ||||||
|   if dedup || dedup_per_page then written := []; |   if dedup || dedup_per_page then written := []; | ||||||
|   let pdf_pages = Pdfpage.pages_of_pagetree pdf in |   let pdf_pages = Pdfpage.pages_of_pagetree pdf in | ||||||
|     let pages = |     let pages = | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user