2021-12-21 14:57:42 +00:00
open Pdfutil
open Pdfio
2023-12-18 15:13:40 +00:00
open Cpdferror
2021-12-21 14:57:42 +00:00
2023-12-28 16:18:25 +00:00
let debug_image_processing = ref false
2024-02-20 19:41:49 +00:00
let complain_jbig2enc path =
if path = " " then error " Specify jbig2enc location with -jbig2enc "
let complain_convert path =
2024-02-22 15:56:35 +00:00
if path = " " then error " Specify magick location with -im "
2024-02-20 19:41:49 +00:00
2023-12-31 11:13:58 +00:00
let remove x =
2024-02-20 19:41:49 +00:00
try (* Printf.printf "%s\n" x; *) Sys . remove x with _ -> ()
2023-12-31 11:13:58 +00:00
2023-12-18 22:39:33 +00:00
let pnm_white ch = output_char ch ' '
let pnm_newline ch = output_char ch '\n'
let pnm_output_string = Stdlib . output_string
let pnm_header ch w h =
pnm_white ch ;
pnm_output_string ch ( string_of_int w ) ;
pnm_white ch ;
pnm_output_string ch ( string_of_int h ) ;
pnm_white ch
let pnm_to_channel_24 ch w h s =
pnm_output_string ch " P6 " ;
pnm_header ch w h ;
pnm_output_string ch " 255 " ;
pnm_newline ch ;
2023-12-29 17:46:54 +00:00
bytes_to_output_channel ch s
2023-12-18 22:39:33 +00:00
let pnm_to_channel_8 ch w h s =
pnm_output_string ch " P5 " ;
pnm_header ch w h ;
2023-12-19 18:11:47 +00:00
pnm_output_string ch " 255 " ;
2023-12-18 22:39:33 +00:00
pnm_newline ch ;
2023-12-29 17:46:54 +00:00
bytes_to_output_channel ch s
2021-12-21 14:57:42 +00:00
2023-12-23 14:07:17 +00:00
let pnm_to_channel_1_inverted ch w h s =
2023-12-22 21:21:23 +00:00
pnm_output_string ch " P4 " ;
pnm_header ch w h ;
pnm_newline ch ;
2023-12-23 14:07:17 +00:00
let inverted = Pdfio . copybytes s in
Pdfio . bytes_selfmap lnot inverted ;
2023-12-29 17:46:54 +00:00
bytes_to_output_channel ch inverted
2023-12-22 21:21:23 +00:00
2023-12-19 19:12:56 +00:00
let cmyk_to_channel_32 ch w h s =
2023-12-29 17:46:54 +00:00
let inverted = Pdfio . copybytes s in
Pdfio . bytes_selfmap ( fun x -> 255 - x ) inverted ;
bytes_to_output_channel ch inverted
2023-12-19 19:12:56 +00:00
2023-12-04 11:19:17 +00:00
let jbig2_serial = ref 0
let jbig2_globals = null_hash ()
2023-12-04 14:00:45 +00:00
let write_stream name stream =
let fh = open_out_bin name in
Pdfio . bytes_to_output_channel fh stream ;
close_out fh
2023-11-10 13:46:52 +00:00
let write_image ~ raw ? path_to_p2p ? path_to_im pdf resources name image =
2023-06-14 18:38:26 +01:00
match Pdfimage . get_image_24bpp pdf resources image with
| Pdfimage . JPEG ( stream , _ ) -> write_stream ( name ^ " .jpg " ) stream
| Pdfimage . JPEG2000 ( stream , _ ) -> write_stream ( name ^ " .jpx " ) stream
2023-12-04 11:19:17 +00:00
| Pdfimage . JBIG2 ( stream , _ , global ) ->
begin match global with
| None ->
2023-12-23 17:18:13 +00:00
(* Printf.printf "JBIG2: No global, writing plain\n"; *)
2023-12-04 11:19:17 +00:00
write_stream ( name ^ " .jbig2 " ) stream
| Some g ->
2023-12-23 17:18:13 +00:00
(* Printf.printf "JBIG2: there is a global\n"; *)
2023-12-04 11:19:17 +00:00
let go () =
let serial , _ = Hashtbl . find jbig2_globals g in
write_stream ( name ^ " .jbig2__ " ^ string_of_int serial ) stream
in
try go () with Not_found ->
jbig2_serial + = 1 ;
let globaldata =
let obj = Pdf . lookup_obj pdf g in
Pdfcodec . decode_pdfstream_until_unknown pdf obj ;
match obj with | Pdf . Stream { contents = ( _ , Got b ) } -> Some b | _ -> None
in
match globaldata with
| Some d ->
Hashtbl . add jbig2_globals g ( ! jbig2_serial , d ) ;
let filename = Filename . concat ( Filename . dirname name ) ( string_of_int ! jbig2_serial ^ " .jbig2global " ) in
write_stream filename d ;
go ()
| None ->
Pdfe . log " Could not extract JBIG2Globals. Skipping this image. "
end
2023-06-14 18:38:26 +01:00
| Pdfimage . Raw ( w , h , Pdfimage . BPP24 , stream ) ->
let pnm = name ^ " .pnm " in
let png = name ^ " .png " in
let fh = open_out_bin pnm in
pnm_to_channel_24 fh w h stream ;
close_out fh ;
begin match path_to_p2p with
2023-11-10 13:46:52 +00:00
| None ->
2023-06-14 18:38:26 +01:00
begin match path_to_im with
2023-11-10 13:46:52 +00:00
None ->
if not raw then Pdfe . log " Neither pnm2png nor imagemagick found. Specify with -p2p or -im \n "
| Some path_to_im ->
2021-12-21 14:57:42 +00:00
begin match
2023-06-14 18:38:26 +01:00
Sys . command ( Filename . quote_command path_to_im [ pnm ; png ] )
2021-12-21 14:57:42 +00:00
with
2023-12-31 11:13:58 +00:00
0 -> remove pnm
2023-06-14 18:38:26 +01:00
| _ ->
Pdfe . log " Call to imagemagick failed: did you specify -p2p or -im correctly? \n " ;
2023-12-31 11:13:58 +00:00
remove pnm
2021-12-21 14:57:42 +00:00
end
end
2023-11-10 13:46:52 +00:00
| Some path_to_p2p ->
2023-06-14 18:38:26 +01:00
begin match
Sys . command ( Filename . quote_command path_to_p2p ~ stdout : png [ " -gamma " ; " 0.45 " ; " -quiet " ; pnm ] )
with
2023-12-31 11:13:58 +00:00
| 0 -> remove pnm
2023-06-14 18:38:26 +01:00
| _ ->
Pdfe . log " Call to pnmtopng failed: did you specify -p2p correctly? \n " ;
2023-12-31 11:13:58 +00:00
remove pnm
2023-06-14 18:38:26 +01:00
end
end
| _ ->
Pdfe . log ( Printf . sprintf " Unsupported image type when extracting image %s " name )
2021-12-21 14:57:42 +00:00
let written = ref []
2023-11-10 13:46:52 +00:00
let extract_images_inner ~ raw ? path_to_p2p ? path_to_im encoding serial pdf resources stem pnum images =
2021-12-21 14:57:42 +00:00
let names = map
( fun _ ->
Cpdfbookmarks . name_of_spec
encoding [] pdf 0 ( stem ^ " -p " ^ string_of_int pnum )
( let r = ! serial in serial := ! serial + 1 ; r ) " " 0 0 ) ( indx images )
in
2023-11-10 13:46:52 +00:00
iter2 ( write_image ~ raw ? path_to_p2p ? path_to_im pdf resources ) names images
2021-12-21 14:57:42 +00:00
2024-11-21 18:47:45 +00:00
let rec extract_images_form_xobject ~ raw ? path_to_p2p ? path_to_im encoding dedup dedup_per_page pdf serial stem pnum form =
2021-12-21 14:57:42 +00:00
let resources =
match Pdf . lookup_direct pdf " /Resources " form with
Some ( Pdf . Dictionary d ) -> Pdf . Dictionary d
| _ -> Pdf . Dictionary []
in
2024-11-21 18:47:45 +00:00
let images , forms =
2021-12-21 14:57:42 +00:00
let xobjects =
match Pdf . lookup_direct pdf " /XObject " resources with
| Some ( Pdf . Dictionary elts ) -> map snd elts
| _ -> []
in
(* Remove any already in !written. Add any remaining to !written, if !args.dedup or !args.dedup_page *)
2024-11-21 18:47:45 +00:00
let images , forms = List . partition ( fun o -> Pdf . lookup_direct pdf " /Subtype " o = Some ( Pdf . Name " /Image " ) ) xobjects in
2021-12-21 14:57:42 +00:00
let already_written , images = List . partition ( function Pdf . Indirect n -> mem n ! written | _ -> false ) images in
if dedup | | dedup_per_page then
written := ( option_map ( function Pdf . Indirect n -> Some n | _ -> None ) images ) @ ! written ;
2024-11-21 18:47:45 +00:00
images , forms
2021-12-21 14:57:42 +00:00
in
2024-11-21 18:47:45 +00:00
iter ( extract_images_form_xobject ~ raw ? path_to_p2p ? path_to_im encoding dedup dedup_per_page pdf serial stem pnum ) forms ;
2023-11-10 13:46:52 +00:00
extract_images_inner ~ raw ? path_to_p2p ? path_to_im encoding serial pdf resources stem pnum images
2021-12-21 14:57:42 +00:00
2023-11-10 13:46:52 +00:00
let extract_images ? ( raw = false ) ? path_to_p2p ? path_to_im encoding dedup dedup_per_page pdf range stem =
2023-12-04 11:19:17 +00:00
Hashtbl . clear jbig2_globals ;
jbig2_serial := 0 ;
2021-12-21 14:57:42 +00:00
if dedup | | dedup_per_page then written := [] ;
let pdf_pages = Pdfpage . pages_of_pagetree pdf in
let pages =
option_map
( function ( i , pdf_pages ) -> if mem i range then Some pdf_pages else None )
( combine ( indx pdf_pages ) pdf_pages )
in
let serial = ref 0 in
iter2
( fun page pnum ->
if dedup_per_page then written := [] ;
let xobjects =
match Pdf . lookup_direct pdf " /XObject " page . Pdfpage . resources with
| Some ( Pdf . Dictionary elts ) -> map snd elts
| _ -> []
in
let images = keep ( fun o -> Pdf . lookup_direct pdf " /Subtype " o = Some ( Pdf . Name " /Image " ) ) xobjects in
let already_written , images = List . partition ( function Pdf . Indirect n -> mem n ! written | _ -> false ) images in
if dedup | | dedup_per_page then
written := ( option_map ( function Pdf . Indirect n -> Some n | _ -> None ) images ) @ ! written ;
let forms = keep ( fun o -> Pdf . lookup_direct pdf " /Subtype " o = Some ( Pdf . Name " /Form " ) ) xobjects in
2023-11-10 13:46:52 +00:00
extract_images_inner ~ raw ? path_to_p2p ? path_to_im encoding serial pdf page . Pdfpage . resources stem pnum images ;
iter ( extract_images_form_xobject ~ raw ? path_to_p2p ? path_to_im encoding dedup dedup_per_page pdf serial stem pnum ) forms )
2021-12-21 14:57:42 +00:00
pages
( indx pages )
2021-12-21 15:25:59 +00:00
(* Image resolution *)
type xobj =
| Image of int * int (* width, height *)
| Form of Pdftransform . transform_matrix * Pdf . pdfobject * Pdf . pdfobject (* Will add actual data later. *)
let image_results = ref []
2024-10-10 17:38:34 +02:00
let rec image_resolution_page pdf page pagenum images =
2021-12-21 15:25:59 +00:00
try
let pageops = Pdfops . parse_operators pdf page . Pdfpage . resources page . Pdfpage . content
and transform = ref [ ref Pdftransform . i_matrix ] in
iter
( function
| Pdfops . Op_cm matrix ->
begin match ! transform with
| [] -> raise ( Failure " no transform " )
| _ -> ( hd ! transform ) := Pdftransform . matrix_compose ! ( hd ! transform ) matrix
end
| Pdfops . Op_Do xobject ->
let trans ( x , y ) =
match ! transform with
| [] -> raise ( Failure " no transform " )
| _ -> Pdftransform . transform_matrix ! ( hd ! transform ) ( x , y )
in
let o = trans ( 0 . , 0 . )
and x = trans ( 1 . , 0 . )
and y = trans ( 0 . , 1 . )
in
(* i Printf.printf "o = %f, %f, x = %f, %f, y = %f, %f\n" ( fst o ) ( snd o ) ( fst x ) ( snd x ) ( fst y ) ( snd y ) ; i *)
let rec lookup_image k = function
| [] -> assert false
2024-01-04 17:40:15 +00:00
| ( _ , a , _ , _ ) as h :: _ when a = k -> h
2021-12-21 15:25:59 +00:00
| _ :: t -> lookup_image k t
in
begin match lookup_image xobject images with
2024-01-04 17:40:15 +00:00
| ( pagenum , name , Form ( xobj_matrix , content , resources ) , objnum ) ->
2021-12-21 15:25:59 +00:00
let content =
(* Add in matrix etc. *)
let total_matrix = Pdftransform . matrix_compose xobj_matrix ! ( hd ! transform ) in
let ops =
Pdfops . Op_cm total_matrix ::
Pdfops . parse_operators pdf resources [ content ]
in
Pdfops . stream_of_ops ops
in
let page =
{ Pdfpage . content = [ content ] ;
Pdfpage . mediabox = Pdfpage . rectangle_of_paper Pdfpaper . a4 ;
Pdfpage . resources = resources ;
Pdfpage . rotate = Pdfpage . Rotate0 ;
Pdfpage . rest = Pdf . Dictionary [] }
in
let newpdf = Pdfpage . change_pages false pdf [ page ] in
2024-11-22 16:30:22 +00:00
image_resolution newpdf [ 1 ] pagenum
2024-01-04 17:40:15 +00:00
| ( pagenum , name , Image ( w , h ) , objnum ) ->
2024-01-10 13:10:39 +00:00
let lx = Pdfunits . inches ( distance_between o x ) Pdfunits . PdfPoint in
let ly = Pdfunits . inches ( distance_between o y ) Pdfunits . PdfPoint in
2021-12-21 15:25:59 +00:00
let wdpi = float w /. lx
and hdpi = float h /. ly in
2024-11-22 15:57:28 +00:00
image_results := ( pagenum , xobject , w , h , wdpi , hdpi , objnum ) :: ! image_results ;
(* Printf.printf "%i, %s, %i, %i, %f, %f\n" pagenum xobject w h wdpi hdpi; *)
2021-12-21 15:25:59 +00:00
end
| Pdfops . Op_q ->
begin match ! transform with
| [] -> raise ( Failure " Unbalanced q/Q ops " )
| h :: t ->
let h' = ref Pdftransform . i_matrix in
h' := ! h ;
transform := h' :: h :: t
end
| Pdfops . Op_Q ->
begin match ! transform with
| [] -> raise ( Failure " Unbalanced q/Q ops " )
| _ -> transform := tl ! transform
end
| _ -> () )
pageops
with
e -> Printf . printf " Error %s \n " ( Printexc . to_string e ) ; flprint " \n "
2024-11-22 16:30:22 +00:00
and image_resolution pdf range real_pagenum =
2021-12-21 15:25:59 +00:00
let images = ref [] in
Cpdfpage . iter_pages
( fun pagenum page ->
2024-11-22 16:30:22 +00:00
let pagenum = if real_pagenum > 0 then real_pagenum else pagenum in
2021-12-21 15:25:59 +00:00
(* 1. Get all image names and their native resolutions from resources as string * int * int *)
match Pdf . lookup_direct pdf " /XObject " page . Pdfpage . resources with
| Some ( Pdf . Dictionary xobjects ) ->
iter
( function ( name , xobject ) ->
2024-01-04 17:40:15 +00:00
let objnum = match xobject with Pdf . Indirect i -> i | _ -> 0 in
2021-12-21 15:25:59 +00:00
match Pdf . lookup_direct pdf " /Subtype " xobject with
| Some ( Pdf . Name " /Image " ) ->
let width =
match Pdf . lookup_direct pdf " /Width " xobject with
2022-07-14 14:06:25 +01:00
| Some x -> Pdf . getnum pdf x
2021-12-21 15:25:59 +00:00
| None -> 1 .
and height =
match Pdf . lookup_direct pdf " /Height " xobject with
2022-07-14 14:06:25 +01:00
| Some x -> Pdf . getnum pdf x
2021-12-21 15:25:59 +00:00
| None -> 1 .
in
2024-01-04 17:40:15 +00:00
images := ( pagenum , name , Image ( int_of_float width , int_of_float height ) , objnum ) :: ! images
2021-12-21 15:25:59 +00:00
| Some ( Pdf . Name " /Form " ) ->
let resources =
match Pdf . lookup_direct pdf " /Resources " xobject with
| None -> page . Pdfpage . resources (* Inherit from page or form above. *)
| Some r -> r
and contents =
xobject
and matrix =
match Pdf . lookup_direct pdf " /Matrix " xobject with
| Some ( Pdf . Array [ a ; b ; c ; d ; e ; f ] ) ->
2022-07-14 14:06:25 +01:00
{ Pdftransform . a = Pdf . getnum pdf a ; Pdftransform . b = Pdf . getnum pdf b ; Pdftransform . c = Pdf . getnum pdf c ;
Pdftransform . d = Pdf . getnum pdf d ; Pdftransform . e = Pdf . getnum pdf e ; Pdftransform . f = Pdf . getnum pdf f }
2021-12-21 15:25:59 +00:00
| _ -> Pdftransform . i_matrix
in
2024-01-04 17:40:15 +00:00
images := ( pagenum , name , Form ( matrix , contents , resources ) , objnum ) :: ! images
2021-12-21 15:25:59 +00:00
| _ -> ()
)
xobjects
| _ -> () )
pdf
2024-11-22 16:30:22 +00:00
( if real_pagenum = 0 then range else [ 1 ] ) ;
2021-12-21 15:25:59 +00:00
(* Now, split into differing pages, and call [image_resolution_page] on each one *)
let pagesplits =
map
2024-01-04 17:40:15 +00:00
( function ( a , _ , _ , _ ) :: _ as ls -> ( a , ls ) | _ -> assert false )
( collate ( fun ( a , _ , _ , _ ) ( b , _ , _ , _ ) -> compare a b ) ( rev ! images ) )
2021-12-21 15:25:59 +00:00
and pages =
Pdfpage . pages_of_pagetree pdf
in
iter
( function ( pagenum , images ) ->
2024-11-22 16:30:22 +00:00
let pagenum = if real_pagenum > 0 then 1 else pagenum in
2021-12-21 15:25:59 +00:00
let page = select pagenum pages in
2024-10-10 17:38:34 +02:00
image_resolution_page pdf page pagenum images )
2021-12-21 15:25:59 +00:00
pagesplits
2024-10-10 17:38:34 +02:00
let is_below_dpi dpi ( _ , _ , _ , _ , wdpi , hdpi , _ ) =
wdpi < dpi | | hdpi < dpi
2021-12-21 15:25:59 +00:00
let image_resolution pdf range dpi =
image_results := [] ;
2024-11-22 16:30:22 +00:00
image_resolution pdf range 0 ;
sort compare ( rev ( keep ( is_below_dpi dpi ) ! image_results ) )
2021-12-21 15:25:59 +00:00
2024-02-07 14:42:38 +00:00
let image_resolution_json pdf range dpi =
let images = image_resolution pdf range dpi in
Pdfio . bytes_of_string
( Cpdfyojson . Safe . pretty_to_string
( ` List ( map ( fun ( pagenum , xobject , w , h , wdpi , hdpi , objnum ) ->
` Assoc [ ( " Object " , ` Int objnum ) ; ( " Page " , ` Int pagenum ) ; ( " XObject " , ` String xobject ) ;
( " W " , ` Int w ) ; ( " H " , ` Int h ) ; ( " Xdpi " , ` Float wdpi ) ; ( " Ydpi " , ` Float hdpi ) ] ) images ) ) )
2023-11-14 16:45:49 +00:00
(* All the images in file referenced at least once from the given range of pages. *)
2023-11-14 16:19:47 +00:00
let images pdf range =
let images = null_hash () in
2023-11-14 17:47:44 +00:00
let formnums = null_hash () in
let rec process_xobject resources pagenum page ( name , xobject ) =
match Pdf . lookup_direct pdf " /Subtype " xobject with
| Some ( Pdf . Name " /Image " ) ->
begin match xobject with
| Pdf . Indirect i ->
begin match Hashtbl . find images i with
2023-12-28 15:48:30 +00:00
| ( pagenums , n , w , h , s , bpc , cs , f ) ->
Hashtbl . replace images i ( pagenum :: pagenums , n , w , h , s , bpc , cs , f )
2023-11-14 17:47:44 +00:00
| exception Not_found ->
let width =
match Pdf . lookup_direct pdf " /Width " xobject with
| Some x -> Pdf . getnum pdf x
| None -> 1 .
and height =
match Pdf . lookup_direct pdf " /Height " xobject with
| Some x -> Pdf . getnum pdf x
| None -> 1 .
2023-12-28 15:48:30 +00:00
and size =
match Pdf . lookup_direct pdf " /Length " xobject with
| Some ( Pdf . Integer x ) -> x
| _ -> 0
and bpc =
match Pdf . lookup_direct pdf " /BitsPerComponent " xobject with
2024-03-22 15:33:08 +00:00
| Some ( Pdf . Integer x ) -> Some x
| _ -> None
2023-11-14 17:47:44 +00:00
and colourspace =
match Pdf . lookup_direct pdf " /ColorSpace " xobject with
| Some x -> Some ( Pdfspace . string_of_colourspace ( Pdfspace . read_colourspace pdf resources x ) )
| None -> None
2023-12-28 15:48:30 +00:00
and filter =
match Pdf . lookup_direct pdf " /Filter " xobject with
| Some ( Pdf . Array [ x ] ) | Some x -> Some ( Pdfwrite . string_of_pdf x )
| None -> None
2023-11-14 17:47:44 +00:00
in
2023-12-28 15:48:30 +00:00
Hashtbl . replace images i ( [ pagenum ] , name , int_of_float width , int_of_float height , size , bpc , colourspace , filter )
2023-11-14 17:47:44 +00:00
end
| _ -> ()
end
| Some ( Pdf . Name " /Form " ) ->
begin match xobject with
| Pdf . Indirect i ->
begin match Hashtbl . find formnums i with
| () -> ()
| exception Not_found ->
Hashtbl . add formnums i () ;
begin match Pdf . lookup_direct pdf " /Resources " xobject with
| Some r ->
begin match Pdf . lookup_direct pdf " /XObject " r with
| Some ( Pdf . Dictionary xobjects ) -> iter ( process_xobject r pagenum page ) xobjects
| _ -> ()
end
| None -> ()
end
end
| _ -> ()
end
| _ -> ()
in
Cpdfpage . iter_pages
( fun pagenum page ->
match Pdf . lookup_direct pdf " /XObject " page . Pdfpage . resources with
| Some ( Pdf . Dictionary xobjects ) ->
iter ( process_xobject page . Pdfpage . resources pagenum page ) xobjects
| _ -> () )
pdf
range ;
let images = list_of_hashtbl images in
2023-12-28 15:48:30 +00:00
let images = map ( fun ( i , ( pnums , n , w , h , s , bpc , c , filter ) ) -> ( i , ( setify ( sort compare pnums ) , n , w , h , s , bpc , c , filter ) ) ) images in
let images = sort ( fun ( _ , ( pnums , _ , _ , _ , _ , _ , _ , _ ) ) ( _ , ( pnums' , _ , _ , _ , _ , _ , _ , _ ) ) -> compare ( hd pnums ) ( hd pnums' ) ) images in
2023-11-14 17:47:44 +00:00
` List
( map
2023-12-28 15:48:30 +00:00
( fun ( i , ( pnums , n , w , h , size , bpc , cs , filter ) ) ->
2023-11-14 17:47:44 +00:00
` Assoc [ ( " Object " , ` Int i ) ;
( " Pages " , ` List ( map ( fun x -> ` Int x ) pnums ) ) ;
( " Name " , ` String n ) ;
( " Width " , ` Int w ) ;
( " Height " , ` Int h ) ;
2023-12-28 15:48:30 +00:00
( " Bytes " , ` Int size ) ;
2024-03-22 15:33:08 +00:00
( " BitsPerComponent " , match bpc with None -> ` Null | Some bpc -> ` Int bpc ) ;
2023-12-28 15:48:30 +00:00
( " Colourspace " , match cs with None -> ` Null | Some s -> ` String s ) ;
( " Filter " , match filter with None -> ` Null | Some s -> ` String s ) ] )
2023-11-14 17:47:44 +00:00
images )
2023-11-13 17:55:59 +00:00
2023-08-14 16:15:11 +01:00
let obj_of_jpeg_data data =
let w , h = Cpdfjpeg . jpeg_dimensions data in
let d =
[ " /Length " , Pdf . Integer ( Pdfio . bytes_size data ) ;
" /Filter " , Pdf . Name " /DCTDecode " ;
" /BitsPerComponent " , Pdf . Integer 8 ;
" /ColorSpace " , Pdf . Name " /DeviceRGB " ;
" /Subtype " , Pdf . Name " /Image " ;
" /Width " , Pdf . Integer w ;
" /Height " , Pdf . Integer h ]
in
2023-12-04 16:32:12 +00:00
Pdf . Stream { contents = ( Pdf . Dictionary d , Pdf . Got data ) } , []
2023-08-14 16:15:11 +01:00
let obj_of_png_data data =
let png = Cpdfpng . read_png ( Pdfio . input_of_bytes data ) in
2024-01-01 11:46:32 +00:00
let d =
[ " /Length " , Pdf . Integer ( Pdfio . bytes_size png . idat ) ;
" /Filter " , Pdf . Name " /FlateDecode " ;
" /Subtype " , Pdf . Name " /Image " ;
" /BitsPerComponent " , Pdf . Integer png . bitdepth ;
" /ColorSpace " , Pdf . Name ( match png . colortype with 0 -> " /DeviceGray " | 2 -> " /DeviceRGB " | _ -> error " obj_of_png_data 1 " ) ;
" /DecodeParms " , Pdf . Dictionary
[ " /BitsPerComponent " , Pdf . Integer png . bitdepth ;
" /Colors " , Pdf . Integer ( match png . colortype with 0 -> 1 | 2 -> 3 | _ -> error " obj_of_png_data 2 " ) ;
" /Columns " , Pdf . Integer png . width ;
" /Predictor " , Pdf . Integer 15 ] ;
" /Width " , Pdf . Integer png . width ;
" /Height " , Pdf . Integer png . height ]
in
Pdf . Stream { contents = ( Pdf . Dictionary d , Pdf . Got png . idat ) } , []
2023-12-04 13:39:56 +00:00
2024-03-22 13:57:04 +00:00
let obj_of_jpeg2000_data data =
let w , h = Cpdfjpeg2000 . jpeg2000_dimensions data in
let d =
[ " /Length " , Pdf . Integer ( Pdfio . bytes_size data ) ;
" /Filter " , Pdf . Name " /JPXDecode " ;
" /Subtype " , Pdf . Name " /Image " ;
" /Width " , Pdf . Integer w ;
" /Height " , Pdf . Integer h ]
in
Pdf . Stream { contents = ( Pdf . Dictionary d , Pdf . Got data ) } , []
2023-12-04 14:00:45 +00:00
let jbig2_dimensions data =
( bget data 11 * 256 * 256 * 256 + bget data 12 * 256 * 256 + bget data 13 * 256 + bget data 14 ,
bget data 15 * 256 * 256 * 256 + bget data 16 * 256 * 256 + bget data 17 * 256 + bget data 18 )
2023-12-04 16:58:13 +00:00
let obj_of_jbig2_data ? global data =
let d , extra =
let decodeparms , extra =
match global with
| Some data ->
[ ( " /DecodeParms " , Pdf . Dictionary [ ( " /JBIG2Globals " , Pdf . Indirect 10000 ) ] ) ] ,
2023-12-04 17:15:15 +00:00
[ ( 10000 , Pdf . Stream { contents = ( Pdf . Dictionary [ ( " /Length " , Pdf . Integer ( bytes_size data ) ) ] , Pdf . Got data ) } ) ]
2023-12-04 16:58:13 +00:00
| None ->
[] , []
in
2023-12-04 14:00:45 +00:00
let w , h = jbig2_dimensions data in
2023-12-04 13:39:56 +00:00
[ ( " /Length " , Pdf . Integer ( Pdfio . bytes_size data ) ) ;
( " /Filter " , Pdf . Name " /JBIG2Decode " ) ;
( " /Subtype " , Pdf . Name " /Image " ) ;
( " /BitsPerComponent " , Pdf . Integer 1 ) ;
( " /ColorSpace " , Pdf . Name " /DeviceGray " ) ;
( " /Width " , Pdf . Integer w ) ;
( " /Height " , Pdf . Integer h ) ]
2023-12-04 16:58:13 +00:00
@ decodeparms , extra
2023-12-04 13:39:56 +00:00
in
2023-12-04 16:58:13 +00:00
Pdf . Stream { contents = ( Pdf . Dictionary d , Pdf . Got data ) } , extra
2023-08-14 16:15:11 +01:00
2024-09-30 18:26:39 +01:00
let image_of_input ? subformat ? title ~ process_struct_tree fobj i =
2024-09-30 18:24:10 +01:00
let pdf , title =
2024-09-30 15:13:56 +01:00
match subformat with
2024-09-30 18:24:10 +01:00
| None -> Pdf . empty () , begin match title with Some x -> x | None -> " " end
2024-09-30 15:13:56 +01:00
| Some Cpdfua . PDFUA1 ->
begin match title with
| None -> error " no -title given "
2024-09-30 18:24:10 +01:00
| Some title -> Cpdfua . create_pdfua1 title Pdfpaper . a4 1 , title
2024-09-30 15:13:56 +01:00
end
| Some Cpdfua . PDFUA2 ->
begin match title with
| None -> error " no -title given "
2024-09-30 18:24:10 +01:00
| Some title -> Cpdfua . create_pdfua2 title Pdfpaper . a4 1 , title
2024-09-30 15:13:56 +01:00
end
in
2023-08-14 16:15:11 +01:00
let data = Pdfio . bytes_of_input i 0 i . Pdfio . in_channel_length in
2023-12-04 17:15:15 +00:00
let obj , extras = fobj () data in
2023-12-04 16:58:13 +00:00
iter ( Pdf . addobj_given_num pdf ) extras ;
2023-08-14 16:15:11 +01:00
let w = match Pdf . lookup_direct pdf " /Width " obj with Some x -> Pdf . getnum pdf x | _ -> assert false in
let h = match Pdf . lookup_direct pdf " /Height " obj with Some x -> Pdf . getnum pdf x | _ -> assert false in
2024-10-01 15:33:53 +01:00
let structinfo =
match process_struct_tree , subformat with
| _ , ( Some Cpdfua . PDFUA1 | Some Cpdfua . PDFUA2 ) | true , _ -> true
| _ -> false
in
2024-10-03 14:47:57 +01:00
if subformat = Some Cpdfua . PDFUA2 then
2024-10-01 16:44:44 +01:00
begin
let str = Pdf . addobj pdf Pdf . Null in
let figure = Pdf . addobj pdf Pdf . Null in
let parent_tree = Pdf . addobj pdf Pdf . Null in
let namespace = Pdf . addobj pdf ( Pdf . Dictionary [ ( " /NS " , Pdf . String " http://iso.org/pdf2/ssn " ) ] ) in
let document = Pdf . addobj pdf Pdf . Null in
Pdf . addobj_given_num pdf ( document , Pdf . Dictionary [ ( " /K " , Pdf . Array [ Pdf . Indirect figure ] ) ; ( " /P " , Pdf . Indirect str ) ; ( " /S " , Pdf . Name " /Document " ) ; ( " /NS " , Pdf . Indirect namespace ) ] ) ;
Pdf . addobj_given_num pdf ( parent_tree , Pdf . Dictionary [ ( " /Nums " , Pdf . Array [ Pdf . Integer 1 ; Pdf . Array [ Pdf . Indirect figure ] ] ) ] ) ;
Pdf . addobj_given_num pdf ( figure , Pdf . Dictionary [ ( " /K " , Pdf . Array [ Pdf . Integer 0 ] ) ; ( " /P " , Pdf . Indirect document ) ; ( " /S " , Pdf . Name " /Figure " ) ; ( " /Alt " , Pdf . String title ) ] ) ;
Pdf . addobj_given_num pdf ( str , Pdf . Dictionary [ ( " /Namespaces " , Pdf . Array [ Pdf . Indirect namespace ] ) ; ( " /Type " , Pdf . Name " /StructTreeRoot " ) ;
( " /K " , Pdf . Array [ Pdf . Indirect document ] ) ; ( " /ParentTree " , Pdf . Indirect parent_tree ) ] ) ;
2024-10-23 13:44:31 +01:00
Pdf . replace_chain pdf [ " /Root " ; " /StructTreeRoot " ] ( Pdf . Indirect str )
2024-10-01 16:44:44 +01:00
end
2024-10-03 14:47:57 +01:00
else if process_struct_tree | | subformat = Some Cpdfua . PDFUA1 then
2024-10-01 15:33:53 +01:00
begin
let str = Pdf . addobj pdf Pdf . Null in
let figure = Pdf . addobj pdf Pdf . Null in
let parent_tree = Pdf . addobj pdf Pdf . Null in
Pdf . addobj_given_num pdf ( parent_tree , Pdf . Dictionary [ ( " /Nums " , Pdf . Array [ Pdf . Integer 1 ; Pdf . Array [ Pdf . Indirect figure ] ] ) ] ) ;
Pdf . addobj_given_num pdf ( figure , Pdf . Dictionary [ ( " /K " , Pdf . Array [ Pdf . Integer 0 ] ) ; ( " /P " , Pdf . Indirect str ) ; ( " /S " , Pdf . Name " /Figure " ) ; ( " /Alt " , Pdf . String title ) ] ) ;
Pdf . addobj_given_num pdf ( str , Pdf . Dictionary [ ( " /Type " , Pdf . Name " /StructTreeRoot " ) ; ( " /K " , Pdf . Array [ Pdf . Indirect figure ] ) ; ( " /ParentTree " , Pdf . Indirect parent_tree ) ] ) ;
2024-10-23 13:44:31 +01:00
Pdf . replace_chain pdf [ " /Root " ; " /StructTreeRoot " ] ( Pdf . Indirect str )
2024-10-01 15:33:53 +01:00
end ;
let ops =
( if structinfo then [ Pdfops . Op_BDC ( " /Figure " , Pdf . Dictionary [ ( " /MCID " , Pdf . Integer 0 ) ] ) ] else [] )
@ [ Pdfops . Op_cm ( Pdftransform . matrix_of_transform [ Pdftransform . Translate ( 0 . , 0 . ) ;
Pdftransform . Scale ( ( 0 . , 0 . ) , w , h ) ] ) ;
Pdfops . Op_Do " /I0 " ]
@ ( if structinfo then [ Pdfops . Op_EMC ] else [] )
in
2023-08-14 16:15:11 +01:00
let page =
2024-10-01 15:33:53 +01:00
{ Pdfpage . content = [ Pdfops . stream_of_ops ops ] ;
2023-08-14 16:15:11 +01:00
Pdfpage . mediabox = Pdf . Array [ Pdf . Real 0 . ; Pdf . Real 0 . ; Pdf . Real w ; Pdf . Real h ] ;
2024-10-01 15:33:53 +01:00
Pdfpage . resources = Pdf . Dictionary [ " /XObject " , Pdf . Dictionary [ " /I0 " , Pdf . Indirect ( Pdf . addobj pdf obj ) ] ] ;
2023-08-14 16:15:11 +01:00
Pdfpage . rotate = Pdfpage . Rotate0 ;
2024-10-01 15:33:53 +01:00
Pdfpage . rest = if structinfo then Pdf . Dictionary [ ( " /StructParents " , Pdf . Integer 1 ) ] else Pdf . Dictionary [] }
2023-08-14 16:15:11 +01:00
in
let pdf , pageroot = Pdfpage . add_pagetree [ page ] pdf in
Pdfpage . add_root pageroot [] pdf
2023-12-06 12:20:27 +00:00
2024-11-06 16:18:57 +00:00
let jpeg_to_jpeg pdf ~ pixel_threshold ~ length_threshold ~ percentage_threshold ~ jpeg_to_jpeg_scale ~ interpolate ~ q ~ path_to_convert s dict reference =
2024-02-20 19:41:49 +00:00
if q < 0 . | | q > 100 . then error " Out of range quality " ;
complain_convert path_to_convert ;
2023-12-24 13:54:21 +00:00
let w = match Pdf . lookup_direct pdf " /Width " dict with Some ( Pdf . Integer i ) -> i | _ -> error " bad width " in
let h = match Pdf . lookup_direct pdf " /Height " dict with Some ( Pdf . Integer i ) -> i | _ -> error " bad height " in
2023-12-29 21:49:56 +00:00
if w * h < pixel_threshold then ( if ! debug_image_processing then Printf . printf " pixel threshold not met \n %! " ) else
2023-12-20 12:11:55 +00:00
Pdf . getstream s ;
2023-12-28 11:32:43 +00:00
let size = match Pdf . lookup_direct pdf " /Length " dict with Some ( Pdf . Integer i ) -> i | _ -> 0 in
2023-12-29 21:49:56 +00:00
if size < length_threshold then ( if ! debug_image_processing then Printf . printf " length threshold not met \n %! " ) else
2024-02-21 18:41:26 +00:00
let out = Filename . temp_file " cpdf " " convertin.jpg " in
let out2 = Filename . temp_file " cpdf " " convertout.jpg " in
2023-12-20 12:11:55 +00:00
let fh = open_out_bin out in
2023-12-28 11:32:43 +00:00
begin match s with Pdf . Stream { contents = _ , Pdf . Got d } -> Pdfio . bytes_to_output_channel fh d | _ -> () end ;
2023-12-20 12:11:55 +00:00
close_out fh ;
let retcode =
2024-11-06 15:15:32 +00:00
let scaling =
if jpeg_to_jpeg_scale < > 100 . then
[ ( if interpolate then " -sample " else " -resize " ) ; string_of_float jpeg_to_jpeg_scale ^ " % " ]
else
[]
in
2023-12-20 12:11:55 +00:00
let command =
2024-11-06 15:15:32 +00:00
Filename . quote_command path_to_convert ( [ out ] @ scaling @ [ " -quality " ; string_of_float q ^ " % " ; out2 ] )
2023-12-20 12:11:55 +00:00
in
2024-12-09 16:55:40 +00:00
(* Printf.printf "%S\n" command; *) Sys . command command
2023-12-20 12:11:55 +00:00
in
if retcode = 0 then
begin
2024-02-20 19:41:49 +00:00
try
let result = open_in_bin out2 in
let newsize = in_channel_length result in
let perc_ok = float newsize /. float size < percentage_threshold /. 100 . in
if newsize < size && perc_ok then
begin
2024-11-06 14:54:02 +00:00
let data = Pdfio . bytes_of_input_channel result in
2024-12-18 13:29:42 +00:00
let w , h = try Cpdfjpeg . jpeg_dimensions data with _ -> ( w , h ) in (* TODO. https://github.com/johnwhitington/cpdf-source/issues/349 *)
2024-02-20 19:41:49 +00:00
if ! debug_image_processing then Printf . printf " JPEG to JPEG %i -> %i (%i%%) \n %! " size newsize ( int_of_float ( float newsize /. float size * . 100 . ) ) ;
2024-11-06 14:54:02 +00:00
reference :=
Pdf . add_dict_entry ( Pdf . add_dict_entry ( Pdf . add_dict_entry dict " /Length " ( Pdf . Integer newsize ) ) " /Width " ( Pdf . Integer w ) ) " /Height " ( Pdf . Integer h ) ,
2024-11-06 15:15:32 +00:00
Pdf . Got data
2024-02-20 19:41:49 +00:00
end
else
begin
if ! debug_image_processing then Printf . printf " no size reduction \n %! "
end ;
close_in result
2024-12-17 18:25:25 +00:00
with e ->
2024-12-18 14:25:10 +00:00
if ! debug_image_processing then Printf . printf " Error %S \n %! " ( Printexc . to_string e ) ;
2024-02-20 19:41:49 +00:00
remove out ;
remove out2
2023-12-31 11:13:58 +00:00
end
else
2024-12-18 14:25:10 +00:00
if ! debug_image_processing then Printf . printf " external process failed \n %! " ;
2024-12-18 14:20:29 +00:00
remove out ;
remove out2
2023-12-20 12:11:55 +00:00
let suitable_num pdf dict =
match Pdf . lookup_direct pdf " /ColorSpace " dict with
2024-02-19 17:56:35 +00:00
| Some ( Pdf . Name ( " /DeviceRGB " | " /CalRGB " ) ) -> 3
| Some ( Pdf . Name ( " /DeviceGray " | " /CalGray " ) ) -> 1
2023-12-20 12:11:55 +00:00
| Some ( Pdf . Name " /DeviceCMYK " ) -> 4
2024-02-19 17:56:35 +00:00
| Some ( Pdf . Array [ Pdf . Name " /Lab " ; _ ] ) -> 3
2023-12-20 12:11:55 +00:00
| Some ( Pdf . Array [ Pdf . Name " /ICCBased " ; stream ] ) ->
begin match Pdf . lookup_direct pdf " /N " stream with
| Some ( Pdf . Integer 3 ) -> 3
| Some ( Pdf . Integer 1 ) -> 1
| Some ( Pdf . Integer 4 ) -> 4
| _ -> 0
end
2024-01-02 17:54:16 +00:00
| Some ( Pdf . Array ( Pdf . Name ( " /Separation " ) :: _ ) ) -> ~ - 1
| Some ( Pdf . Array ( Pdf . Name ( " /Indexed " ) :: _ ) ) -> ~ - 2
2023-12-20 12:11:55 +00:00
| _ -> 0
2023-12-29 17:22:02 +00:00
let lossless_out pdf ~ pixel_threshold ~ length_threshold extension s dict reference =
2024-01-03 17:43:51 +00:00
let old = ! reference in
let restore () = reference := old in
2023-12-20 12:11:55 +00:00
let bpc = Pdf . lookup_direct pdf " /BitsPerComponent " dict in
let components = suitable_num pdf dict in
match components , bpc with
2024-01-02 17:54:16 +00:00
| ( 1 | 3 | 4 | - 1 | - 2 ) , Some ( Pdf . Integer 8 ) ->
2023-12-24 13:54:21 +00:00
let w = match Pdf . lookup_direct pdf " /Width " dict with Some ( Pdf . Integer i ) -> i | _ -> error " bad width " in
let h = match Pdf . lookup_direct pdf " /Height " dict with Some ( Pdf . Integer i ) -> i | _ -> error " bad height " in
2023-12-29 21:49:56 +00:00
if w * h < pixel_threshold then ( if ! debug_image_processing then Printf . printf " pixel threshold not met \n %! " ; None ) else
2023-12-20 12:11:55 +00:00
let size = match Pdf . lookup_direct pdf " /Length " dict with Some ( Pdf . Integer i ) -> i | _ -> 0 in
2023-12-29 21:49:56 +00:00
if size < length_threshold then ( if ! debug_image_processing then Printf . printf " length threshold not met \n %! " ; None ) else
2023-12-29 17:22:02 +00:00
begin
Pdfcodec . decode_pdfstream_until_unknown pdf s ;
2024-12-18 14:25:10 +00:00
match Pdf . lookup_direct pdf " /Filter " ( fst ! reference ) with Some x -> restore () ; if ! debug_image_processing then Printf . printf " %S Unable to decompress \n %! " ( Pdfwrite . string_of_pdf x ) ; None | None ->
2024-02-21 18:41:26 +00:00
let out = Filename . temp_file " cpdf " ( " convertin " ^ ( if suitable_num pdf dict < 4 then " .pnm " else " .cmyk " ) ) in
let out2 = Filename . temp_file " cpdf " ( " convertout " ^ extension ) in
2023-12-20 12:11:55 +00:00
let fh = open_out_bin out in
let data = match s with Pdf . Stream { contents = _ , Pdf . Got d } -> d | _ -> assert false in
( if components = 3 then pnm_to_channel_24 else
if components = 4 then cmyk_to_channel_32 else pnm_to_channel_8 ) fh w h data ;
close_out fh ;
2023-12-29 17:22:02 +00:00
Some ( out , out2 , size , components , w , h )
2023-12-20 12:11:55 +00:00
end
| colspace , bpc ->
2024-02-20 19:41:49 +00:00
(* let colspace = Pdf.lookup_direct pdf "/ColorSpace" dict in
2023-12-20 12:11:55 +00:00
let colspace , bpc , filter =
( match colspace with None -> " none " | Some x -> Pdfwrite . string_of_pdf x ) ,
( match bpc with None -> " none " | Some x -> Pdfwrite . string_of_pdf x ) ,
( match Pdf . lookup_direct pdf " /Filter " dict with None -> " none " | Some x -> Pdfwrite . string_of_pdf x )
in
2023-12-22 16:12:19 +00:00
print_string ( Pdfwrite . string_of_pdf dict ) ;
2024-02-20 19:41:49 +00:00
print_string ( Printf . sprintf " %s (%s) [%s] \n " colspace bpc filter ) ; * )
2023-12-31 11:13:58 +00:00
if ! debug_image_processing then Printf . printf " colourspace not suitable \n %! " ;
2024-01-03 17:43:51 +00:00
restore () ;
2023-12-29 17:22:02 +00:00
None (* an image we cannot or do not handle *)
let lossless_to_jpeg pdf ~ pixel_threshold ~ length_threshold ~ percentage_threshold ~ qlossless ~ path_to_convert s dict reference =
2024-02-20 19:41:49 +00:00
complain_convert path_to_convert ;
2024-02-21 17:03:22 +00:00
match lossless_out pdf ~ pixel_threshold ~ length_threshold " .jpg " s dict reference with
| None -> ()
| Some ( _ , _ , _ , - 2 , _ , _ ) ->
if ! debug_image_processing then Printf . printf " skipping indexed colorspace \n %! "
| Some ( out , out2 , size , components , w , h ) ->
2023-12-29 17:22:02 +00:00
let retcode =
let command =
( Filename . quote_command path_to_convert
( ( if components = 4 then [ " -depth " ; " 8 " ; " -size " ; string_of_int w ^ " x " ^ string_of_int h ] else [] ) @
2024-02-01 16:38:07 +00:00
[ out ; " -quality " ; string_of_float qlossless ^ " % " ] @
2024-12-17 15:53:55 +00:00
( if components = 1 then [ " -colorspace " ; " Gray " ] else if components = 4 then [ " -colorspace " ; " CMYK " ] else [ " -type " ; " truecolor " ] ) @
2023-12-29 17:22:02 +00:00
[ out2 ] ) )
in
(* Printf.printf "%S\n" command; *) Sys . command command
in
if retcode = 0 then
begin
2024-02-20 19:41:49 +00:00
try
let result = open_in_bin out2 in
let newsize = in_channel_length result in
let perc_ok = float newsize /. float size < percentage_threshold /. 100 . in
if newsize < size && perc_ok then
begin
if ! debug_image_processing then Printf . printf " lossless to JPEG %i -> %i (%i%%) \n %! " size newsize ( int_of_float ( float newsize /. float size * . 100 . ) ) ;
reference :=
( Pdf . add_dict_entry
( Pdf . add_dict_entry dict " /Length " ( Pdf . Integer newsize ) )
" /Filter "
( Pdf . Name " /DCTDecode " ) ) ,
Pdf . Got ( Pdfio . bytes_of_input_channel result )
end
else
begin
if ! debug_image_processing then Printf . printf " no size reduction \n %! "
end ;
close_in result
with
2024-12-18 14:20:29 +00:00
e ->
2024-12-18 14:25:10 +00:00
if ! debug_image_processing then Printf . printf " Failed with %S \n %! " ( Printexc . to_string e ) ;
2024-02-20 19:41:49 +00:00
remove out ;
remove out2
2024-12-18 14:20:29 +00:00
end
else
2024-12-18 14:25:10 +00:00
if ! debug_image_processing then Printf . printf " Return code not zero \n %! " ;
2023-12-31 11:13:58 +00:00
remove out ;
remove out2
2023-12-29 17:22:02 +00:00
2024-02-20 14:59:14 +00:00
let test_components pdf dict =
match suitable_num pdf dict with - 1 | - 2 -> 1 | x -> x
let test_bpc pdf dict =
match Pdf . lookup_direct pdf " /BitsPerComponent " dict with
| Some ( Pdf . Integer i ) -> i
| _ -> 0
2024-02-01 13:29:20 +00:00
let lossless_resample pdf ~ pixel_threshold ~ length_threshold ~ factor ~ interpolate ~ path_to_convert s dict reference =
2024-02-20 19:41:49 +00:00
complain_convert path_to_convert ;
2024-02-20 14:59:14 +00:00
let in_components = test_components pdf dict in
let in_bpc = test_bpc pdf dict in
2024-02-20 19:41:49 +00:00
(* Printf.printf " * * * lossless_resample IN dictionary: %S\n" ( Pdfwrite.string_of_pdf dict ) ; *)
2024-02-20 16:02:56 +00:00
(* Printf.printf "\n * * * IN components = %i, bpc = %i\n" in_components in_bpc; *)
2024-01-02 14:58:35 +00:00
match lossless_out pdf ~ pixel_threshold ~ length_threshold " .png " s dict reference with
| None -> ()
2024-12-18 14:25:10 +00:00
| Some ( _ , _ , _ , 4 , _ , _ ) -> if ! debug_image_processing then Printf . printf " lossless resampling for CMYK not supported yet \n %! "
2024-01-02 14:58:35 +00:00
| Some ( out , out2 , size , components , w , h ) ->
let retcode =
let command =
Filename . quote_command path_to_convert
2024-02-22 15:56:35 +00:00
( [ out ] @ ( if components = 4 then [ " -depth " ; " 8 " ; " -size " ; string_of_int w ^ " x " ^ string_of_int h ] else [] ) @
2024-01-02 14:58:35 +00:00
( if components = 1 then [ " -define " ; " png:color-type=0 " ; " -colorspace " ; " Gray " ] else if components = 3 then [ " -define " ; " -png:color-type=2 " ; " -colorspace " ; " RGB " ] else if components = 4 then [ " -colorspace " ; " CMYK " ] else [] ) @
2024-02-22 15:56:35 +00:00
[ if interpolate && components > - 2 then " -resize " else " -sample " ; string_of_float factor ^ " % " ; out2 ] )
2024-01-02 14:15:06 +00:00
in
2024-01-02 17:54:16 +00:00
(* Printf.printf "%S\n" command; *)
2024-01-02 14:58:35 +00:00
Sys . command command
2023-12-29 17:22:02 +00:00
in
2024-01-02 14:58:35 +00:00
try
2023-12-29 17:22:02 +00:00
if retcode = 0 then
begin
2024-01-02 14:58:35 +00:00
let result = open_in_bin out2 in
2023-12-29 17:22:02 +00:00
let newsize = in_channel_length result in
if newsize < size then
begin
reference :=
2024-01-01 19:09:40 +00:00
( match fst ( obj_of_png_data ( Pdfio . bytes_of_input_channel result ) ) with
2024-02-20 14:59:14 +00:00
| Pdf . Stream { contents = Pdf . Dictionary d , data } as s ->
let out_components = test_components pdf s in
let out_bpc = test_bpc pdf s in
2024-02-20 16:02:56 +00:00
(* Printf.printf " * * * OUT components = %i, bpc = %i\n" out_components out_bpc; *)
let rgb_to_grey_special =
let was_rgb =
match Pdf . lookup_direct pdf " /ColorSpace " dict with
| Some ( Pdf . Name ( " /DeviceRGB " | " /CalRGB " ) ) -> true
| _ -> false
in
in_bpc = out_bpc && in_components = 3 && out_components = 1 && was_rgb
in
(* Printf.printf " * * * rgb_to_grey_special = %b\n" rgb_to_grey_special; *)
if ( out_components < > in_components | | in_bpc < > out_bpc ) && not rgb_to_grey_special then
2024-02-20 14:59:14 +00:00
begin
if ! debug_image_processing then Printf . printf " wrong bpc / components returned. Skipping. \n %! " ;
! reference
end
else
begin
if ! debug_image_processing then Printf . printf " lossless resample %i -> %i (%i%%) \n %! " size newsize ( int_of_float ( float newsize /. float size * . 100 . ) ) ;
2024-02-20 16:02:56 +00:00
let d' = fold_right ( fun ( k , v ) d -> if k < > " /ColorSpace " | | rgb_to_grey_special then add k v d else d ) d ( match dict with Pdf . Dictionary x -> x | _ -> [] ) in
2024-02-20 14:59:14 +00:00
(* Printf.printf " * * * lossless_resample OUT dictionary: %S\n" ( Pdfwrite.string_of_pdf ( Pdf.Dictionary d' ) ) ; *)
( Pdf . Dictionary d' , data )
end
2024-01-01 19:09:40 +00:00
| _ -> assert false )
2023-12-29 21:49:56 +00:00
end
else
begin
if ! debug_image_processing then Printf . printf " no size reduction \n %! "
2023-12-29 17:22:02 +00:00
end ;
close_in result
2024-02-01 19:31:39 +00:00
end ;
remove out ;
remove out2
2024-12-18 14:20:29 +00:00
with e ->
2024-12-18 14:25:10 +00:00
if ! debug_image_processing then Printf . printf " Unable: %S \n " ( Printexc . to_string e ) ;
2024-02-20 19:41:49 +00:00
remove out ;
remove out2
2023-12-20 12:11:55 +00:00
2024-02-01 15:41:27 +00:00
let lossless_resample_target_dpi objnum pdf ~ pixel_threshold ~ length_threshold ~ factor ~ target_dpi_info ~ interpolate ~ path_to_convert s dict reference =
2024-11-06 20:03:43 +00:00
try
let real_factor = factor /. Hashtbl . find target_dpi_info objnum * . 100 . in
if real_factor < 100 . then
lossless_resample pdf ~ pixel_threshold ~ length_threshold ~ factor : real_factor ~ interpolate ~ path_to_convert s dict reference
else
if ! debug_image_processing then Printf . printf " failed to meet dpi target \n %! "
with
2024-12-18 14:25:10 +00:00
Not_found -> if ! debug_image_processing then Printf . printf " Warning: orphaned image, skipping \n " (* Could not find DPI data - an orphan image. *)
2024-02-01 14:29:41 +00:00
2024-11-06 16:18:57 +00:00
let jpeg_to_jpeg_wrapper objnum pdf ~ target_dpi_info ~ pixel_threshold ~ length_threshold ~ percentage_threshold ~ jpeg_to_jpeg_scale ~ jpeg_to_jpeg_dpi ~ interpolate ~ q ~ path_to_convert s dict reference =
if jpeg_to_jpeg_dpi = 0 . then
jpeg_to_jpeg pdf ~ pixel_threshold ~ length_threshold ~ percentage_threshold ~ jpeg_to_jpeg_scale ~ interpolate ~ q ~ path_to_convert s dict reference
else
2024-11-06 20:03:43 +00:00
try
let factor = jpeg_to_jpeg_dpi in
let real_factor = factor /. Hashtbl . find target_dpi_info objnum * . 100 . in
if real_factor < 100 . then
jpeg_to_jpeg pdf ~ pixel_threshold ~ length_threshold ~ percentage_threshold ~ jpeg_to_jpeg_scale : real_factor ~ interpolate ~ q ~ path_to_convert s dict reference
else
if ! debug_image_processing then Printf . printf " failed to meet dpi target \n %! "
with
2024-12-18 14:25:10 +00:00
Not_found -> if ! debug_image_processing then Printf . printf " Warning: orphaned image, skipping \n " (* Could not find DPI data - an orphan image. *)
2024-11-06 16:18:57 +00:00
2023-12-28 11:32:43 +00:00
let recompress_1bpp_jbig2_lossless ~ pixel_threshold ~ length_threshold ~ path_to_jbig2enc pdf s dict reference =
2024-02-20 18:55:20 +00:00
complain_jbig2enc path_to_jbig2enc ;
2024-01-03 17:43:51 +00:00
let old = ! reference in
let restore () = reference := old in
2023-12-24 13:54:21 +00:00
let w = match Pdf . lookup_direct pdf " /Width " dict with Some ( Pdf . Integer i ) -> i | _ -> error " bad width " in
let h = match Pdf . lookup_direct pdf " /Height " dict with Some ( Pdf . Integer i ) -> i | _ -> error " bad height " in
2023-12-29 21:49:56 +00:00
if w * h < pixel_threshold then ( if ! debug_image_processing then Printf . printf " pixel threshold not met \n %! " ) else (* ( but also, jbig2enc fails on tiny images ) *)
2023-12-22 21:21:23 +00:00
let size = match Pdf . lookup_direct pdf " /Length " dict with Some ( Pdf . Integer i ) -> i | _ -> 0 in
2023-12-31 11:13:58 +00:00
if size < length_threshold then ( if ! debug_image_processing then Printf . printf " length threshold not met \n %! " ) else
begin
Pdfcodec . decode_pdfstream_until_unknown pdf s ;
match Pdf . lookup_direct pdf " /Filter " ( fst ! reference ) with
2024-01-03 17:43:51 +00:00
| Some x ->
if ! debug_image_processing then Printf . printf " could not decode - skipping %s length %i \n %! " ( Pdfwrite . string_of_pdf x ) size ;
restore ()
2023-12-31 11:13:58 +00:00
| None ->
2024-02-21 18:41:26 +00:00
let out = Filename . temp_file " cpdf " " convertin.pnm " in
let out2 = Filename . temp_file " cpdf " " convertout.jbig2 " in
2023-12-31 11:13:58 +00:00
let fh = open_out_bin out in
let data = match s with Pdf . Stream { contents = _ , Pdf . Got d } -> d | _ -> assert false in
pnm_to_channel_1_inverted fh w h data ;
close_out fh ;
let retcode =
2024-01-12 15:00:28 +00:00
let command = Filename . quote_command ~ stdout : out2 path_to_jbig2enc [ " -d " ; " -p " ; out ] in
2023-12-31 11:13:58 +00:00
(* Printf.printf "%S\n" command; *) Sys . command command
in
2024-02-20 18:55:20 +00:00
if retcode < > 0 then
restore ()
else
2023-12-31 11:13:58 +00:00
begin
let result = open_in_bin out2 in
let newsize = in_channel_length result in
if newsize < size then
begin
if ! debug_image_processing then Printf . printf " 1bpp to JBIG2 %i -> %i (%i%%) \n %! " size newsize ( int_of_float ( float newsize /. float size * . 100 . ) ) ;
reference :=
( Pdf . remove_dict_entry
( Pdf . add_dict_entry
( Pdf . add_dict_entry dict " /Length " ( Pdf . Integer newsize ) )
" /Filter "
( Pdf . Name " /JBIG2Decode " ) ) " /DecodeParms " ) ,
Pdf . Got ( Pdfio . bytes_of_input_channel result )
end
else
begin
if ! debug_image_processing then Printf . printf " no size reduction \n %! "
end ;
close_in result
end ;
remove out ;
remove out2
end
2023-12-22 16:45:53 +00:00
2024-01-12 12:45:35 +00:00
(* Recompress 1bpp images ( except existing JBIG2 compressed ones ) to lossy jbig2 *)
2024-01-12 15:00:28 +00:00
let preprocess_jbig2_lossy ~ path_to_jbig2enc ~ jbig2_lossy_threshold ~ length_threshold ~ pixel_threshold ~ dpi_threshold inrange highdpi pdf =
2024-02-20 18:55:20 +00:00
complain_jbig2enc path_to_jbig2enc ;
2024-01-11 16:48:34 +00:00
let objnum_name_pairs = ref [] in
let process_obj objnum s =
2024-01-12 12:45:35 +00:00
match s with
| Pdf . Stream ( { contents = dict , _ } as reference ) ->
let old = ! reference in
let restore () = reference := old in
2024-02-01 16:38:07 +00:00
if Hashtbl . mem inrange objnum && ( dpi_threshold = 0 . | | Hashtbl . mem highdpi objnum ) then begin match
2024-01-12 12:45:35 +00:00
Pdf . lookup_direct pdf " /Subtype " dict ,
Pdf . lookup_direct pdf " /BitsPerComponent " dict ,
Pdf . lookup_direct pdf " /ImageMask " dict
with
| Some ( Pdf . Name " /Image " ) , Some ( Pdf . Integer 1 ) , _
| Some ( Pdf . Name " /Image " ) , _ , Some ( Pdf . Boolean true ) ->
let w = match Pdf . lookup_direct pdf " /Width " dict with Some ( Pdf . Integer i ) -> i | _ -> error " bad width " in
let h = match Pdf . lookup_direct pdf " /Height " dict with Some ( Pdf . Integer i ) -> i | _ -> error " bad height " in
if w * h < pixel_threshold then ( if ! debug_image_processing then Printf . printf " pixel threshold not met \n %! " ) else (* ( but also, jbig2enc fails on tiny images ) *)
let size = match Pdf . lookup_direct pdf " /Length " dict with Some ( Pdf . Integer i ) -> i | _ -> 0 in
if size < length_threshold then ( if ! debug_image_processing then Printf . printf " length threshold not met \n %! " ) else
begin
Pdfcodec . decode_pdfstream_until_unknown pdf s ;
match Pdf . lookup_direct pdf " /Filter " ( fst ! reference ) with
| Some x ->
if ! debug_image_processing then Printf . printf " could not decode - skipping %s length %i \n %! " ( Pdfwrite . string_of_pdf x ) size ;
restore ()
| None ->
2024-02-21 18:41:26 +00:00
let out = Filename . temp_file " cpdf " " convertin.pnm " in
2024-01-12 12:45:35 +00:00
let fh = open_out_bin out in
let data = match s with Pdf . Stream { contents = _ , Pdf . Got d } -> d | _ -> assert false in
pnm_to_channel_1_inverted fh w h data ;
close_out fh ;
2024-02-20 18:55:20 +00:00
if ! debug_image_processing then Printf . printf " JBIG2Lossy: obj %i is suitable \n %! " objnum ;
2024-01-12 12:45:35 +00:00
objnum_name_pairs := ( objnum , out ) :: ! objnum_name_pairs
end
| _ -> () (* not a 1bpp image *)
end
| _ -> () (* not a stream *)
2024-01-11 16:48:34 +00:00
in
2024-01-11 18:24:42 +00:00
Pdf . objiter process_obj pdf ;
if length ! objnum_name_pairs > 10000 then Pdfe . log " Too many jbig2 streams " else
2024-01-11 20:18:27 +00:00
if length ! objnum_name_pairs = 0 then () else
2024-01-11 18:24:42 +00:00
let jbig2out = Filename . temp_file " cpdf " " jbig2 " in
let retcode =
2024-01-12 13:06:29 +00:00
let command =
Filename . quote_command
path_to_jbig2enc
? stderr : ( if ! debug_image_processing then None else Some Filename . null )
2024-01-12 15:00:28 +00:00
( [ " -p " ; " -s " ; " -d " ; " -t " ; string_of_float jbig2_lossy_threshold ; " -b " ; jbig2out ] @ map snd ! objnum_name_pairs )
2024-01-12 13:06:29 +00:00
in
2024-01-11 18:24:42 +00:00
(* Printf.printf "%S\n" command; *) Sys . command command
in
2024-01-12 14:16:11 +00:00
iter remove ( map snd ! objnum_name_pairs ) ;
2024-01-11 18:24:42 +00:00
if retcode = 0 then
begin
let globals = bytes_of_string ( contents_of_file ( jbig2out ^ " .sym " ) ) in
let globalobj =
Pdf . addobj pdf ( Pdf . Stream { contents = Pdf . Dictionary [ ( " /Length " , Pdf . Integer ( bytes_size globals ) ) ] , Pdf . Got globals } )
in
2024-01-11 18:57:11 +00:00
iter2
( fun ( objnum , _ ) i ->
let data = bytes_of_string ( contents_of_file ( jbig2out ^ Printf . sprintf " .%04i " i ) ) in
let basic_obj =
Pdf . Stream
{ contents =
Pdf . Dictionary [ ( " /Length " , Pdf . Integer ( bytes_size data ) ) ;
( " /Filter " , Pdf . Name " /JBIG2Decode " ) ;
2024-01-11 19:36:54 +00:00
( " /DecodeParms " , Pdf . Dictionary [ ( " /JBIG2Globals " , Pdf . Indirect globalobj ) ] ) ] ,
2024-01-11 18:57:11 +00:00
Pdf . Got data }
in
let dict = match Pdf . lookup_obj pdf objnum with Pdf . Stream { contents = d , _ } -> d | _ -> Pdf . Dictionary [] in
Pdf . addobj_given_num pdf
( objnum ,
( match basic_obj with
| Pdf . Stream { contents = Pdf . Dictionary d , data } ->
let d' = fold_right ( fun ( k , v ) d -> add k v d ) d ( match dict with Pdf . Dictionary x -> x | _ -> [] ) in
Pdf . Stream { contents = Pdf . Dictionary d' , data }
| _ -> assert false ) ) )
! objnum_name_pairs
( indx0 ! objnum_name_pairs )
2024-01-11 18:24:42 +00:00
end
2024-01-12 14:16:11 +00:00
else
begin
Pdfe . log " Call to jbig2enc failed "
end ;
iter ( fun i -> remove ( jbig2out ^ Printf . sprintf " .%04i " i ) ) ( indx0 ! objnum_name_pairs ) ;
remove ( jbig2out ^ " .sym " )
2024-01-10 17:09:49 +00:00
2023-12-27 19:53:02 +00:00
let process
2024-02-01 16:38:07 +00:00
~ q ~ qlossless ~ onebppmethod ~ jbig2_lossy_threshold ~ length_threshold ~ percentage_threshold ~ pixel_threshold ~ dpi_threshold
2024-11-05 14:02:57 +00:00
~ factor ~ interpolate ~ jpeg_to_jpeg_scale ~ jpeg_to_jpeg_dpi ~ path_to_jbig2enc ~ path_to_convert range pdf
2023-12-27 19:53:02 +00:00
=
2024-01-04 11:33:17 +00:00
let inrange =
match images pdf range with
| ` List l -> hashset_of_list ( map ( function ` Assoc ( ( " Object " , ` Int i ) :: _ ) -> i | _ -> assert false ) l )
| _ -> assert false
in
2024-02-01 15:22:19 +00:00
let highdpi , target_dpi_info =
let objnums , dpi =
2024-11-06 20:03:43 +00:00
if dpi_threshold = 0 . && factor > 0 . && jpeg_to_jpeg_dpi = 0 . then ( [] , [] ) else
2024-01-04 19:31:21 +00:00
let results = image_resolution pdf range max_float in
2024-01-10 13:55:40 +00:00
(* iter ( fun ( _, _, _, _, wdpi, hdpi, objnum ) -> Printf.printf "From image_resolution %f %f %i\n" wdpi hdpi objnum ) results; *)
2024-01-04 19:31:21 +00:00
let cmp ( _ , _ , _ , _ , _ , _ , a ) ( _ , _ , _ , _ , _ , _ , b ) = compare a b in
let sets = collate cmp ( sort cmp results ) in
let heads = map hd ( map ( sort ( fun ( _ , _ , _ , _ , a , b , _ ) ( _ , _ , _ , _ , c , d , _ ) -> compare ( fmin a b ) ( fmin c d ) ) ) sets ) in
2024-01-10 13:55:40 +00:00
(* iter ( fun ( _, _, _, _, wdpi, hdpi, objnum ) -> Printf.printf "Lowest resolution exemplar %f %f %i\n" wdpi hdpi objnum ) heads; *)
2024-02-01 16:38:07 +00:00
let needed = keep ( fun ( _ , _ , _ , _ , wdpi , hdpi , objnum ) -> fmin wdpi hdpi > dpi_threshold ) heads in
2024-01-10 13:55:40 +00:00
(* iter ( fun ( _, _, _, _, wdpi, hdpi, objnum ) -> Printf.printf "keep %f %f %i\n" wdpi hdpi objnum ) needed; *)
2024-02-01 15:22:19 +00:00
map ( fun ( _ , _ , _ , _ , _ , _ , objnum ) -> objnum ) needed ,
map ( fun ( _ , _ , _ , _ , wdpi , hdpi , objnum ) -> ( objnum , fmin wdpi hdpi ) ) heads
2024-02-01 19:31:39 +00:00
(* iter ( fun ( x, d ) -> Printf.printf "obj %i at %f dpi\n" x d ) r; r *)
2024-01-04 19:31:21 +00:00
in
2024-02-01 15:41:27 +00:00
hashset_of_list objnums , hashtable_of_dictionary dpi
2024-01-04 19:31:21 +00:00
in
2024-02-01 16:38:07 +00:00
begin match onebppmethod with " JBIG2Lossy " -> preprocess_jbig2_lossy ~ path_to_jbig2enc ~ jbig2_lossy_threshold ~ dpi_threshold ~ length_threshold ~ pixel_threshold inrange highdpi pdf | _ -> () end ;
2023-12-29 20:09:50 +00:00
let nobjects = Pdf . objcard pdf in
let ndone = ref 0 in
2023-12-28 16:18:25 +00:00
let process_obj objnum s =
2023-12-17 12:58:32 +00:00
match s with
| Pdf . Stream ( { contents = dict , _ } as reference ) ->
2023-12-29 20:09:50 +00:00
ndone + = 1 ;
2024-02-01 16:38:07 +00:00
if Hashtbl . mem inrange objnum && ( dpi_threshold = 0 . | | Hashtbl . mem highdpi objnum ) then begin match
2023-12-22 16:12:19 +00:00
Pdf . lookup_direct pdf " /Subtype " dict ,
Pdf . lookup_direct pdf " /Filter " dict ,
Pdf . lookup_direct pdf " /BitsPerComponent " dict ,
Pdf . lookup_direct pdf " /ImageMask " dict
with
| Some ( Pdf . Name " /Image " ) , Some ( Pdf . Name " /DCTDecode " | Pdf . Array [ Pdf . Name " /DCTDecode " ] ) , _ , _ ->
2024-11-06 15:15:32 +00:00
if q < 100 . | | jpeg_to_jpeg_scale < > 100 . | | jpeg_to_jpeg_dpi < > 0 . then
2024-02-01 16:38:07 +00:00
begin
if ! debug_image_processing then Printf . printf " (%i/%i) Object %i (JPEG)... %! " ! ndone nobjects objnum ;
2024-11-06 16:18:57 +00:00
jpeg_to_jpeg_wrapper objnum pdf ~ target_dpi_info ~ pixel_threshold ~ length_threshold ~ percentage_threshold ~ jpeg_to_jpeg_scale ~ jpeg_to_jpeg_dpi ~ interpolate ~ q ~ path_to_convert s dict reference
2024-02-01 16:38:07 +00:00
end
2023-12-22 16:12:19 +00:00
| Some ( Pdf . Name " /Image " ) , _ , Some ( Pdf . Integer 1 ) , _
| Some ( Pdf . Name " /Image " ) , _ , _ , Some ( Pdf . Boolean true ) ->
2023-12-28 16:18:25 +00:00
begin match onebppmethod with
2024-02-01 16:38:07 +00:00
| " JBIG2 " ->
2023-12-29 21:49:56 +00:00
begin
2024-12-18 14:20:29 +00:00
if ! debug_image_processing then Printf . printf " (%i/%i) Object %i (1bpp)... %! " ! ndone nobjects objnum ;
2023-12-29 21:49:56 +00:00
recompress_1bpp_jbig2_lossless ~ pixel_threshold ~ length_threshold ~ path_to_jbig2enc pdf s dict reference
end
2023-12-28 16:18:25 +00:00
| _ -> ()
end
2023-12-22 16:12:19 +00:00
| Some ( Pdf . Name " /Image " ) , _ , _ , _ ->
2024-02-01 16:38:07 +00:00
if qlossless < 101 . then
begin
2024-12-18 14:20:29 +00:00
if ! debug_image_processing then Printf . printf " (%i/%i) Object %i (lossless)... %! " ! ndone nobjects objnum ;
2024-02-01 16:38:07 +00:00
lossless_to_jpeg pdf ~ pixel_threshold ~ length_threshold ~ percentage_threshold ~ qlossless ~ path_to_convert s dict reference
end
else
begin
if factor < 101 . then
2023-12-29 21:49:56 +00:00
begin
2024-12-18 14:20:29 +00:00
if ! debug_image_processing then Printf . printf " (%i/%i) Object %i (lossless)... %! " ! ndone nobjects objnum ;
2024-02-01 16:38:07 +00:00
if factor < 0 . then
lossless_resample_target_dpi objnum pdf ~ pixel_threshold ~ length_threshold ~ factor : ~ -. factor ~ target_dpi_info ~ interpolate ~ path_to_convert s dict reference
else
lossless_resample pdf ~ pixel_threshold ~ length_threshold ~ factor ~ interpolate ~ path_to_convert s dict reference
2023-12-29 21:49:56 +00:00
end
2024-02-01 16:38:07 +00:00
end
2023-12-17 12:58:32 +00:00
| _ -> () (* not an image *)
end
2023-12-29 20:09:50 +00:00
| _ -> ndone + = 1 (* not a stream *)
2023-12-07 14:54:47 +00:00
in
Pdf . objiter process_obj pdf