From 9bdeccb3431452f3b25bb8c4668c24ea73507caa Mon Sep 17 00:00:00 2001 From: John Whitington Date: Tue, 21 Dec 2021 15:25:59 +0000 Subject: [PATCH] more --- Makefile | 2 +- cpdf.ml | 320 +------------------------------------------------ cpdf.mli | 10 -- cpdfcommand.ml | 6 +- cpdfdraft.ml | 163 +++++++++++++++++++++++++ cpdfdraft.mli | 2 + cpdfimage.ml | 148 +++++++++++++++++++++++ cpdfimage.mli | 2 + 8 files changed, 320 insertions(+), 333 deletions(-) create mode 100644 cpdfdraft.ml create mode 100644 cpdfdraft.mli diff --git a/Makefile b/Makefile index 72641d2..8d0a700 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ MODS = cpdfyojson cpdfxmlm \ cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \ cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \ cpdfbookmarks cpdfpage cpdfaddtext cpdf cpdfimage cpdffont cpdftype \ - cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfspot \ + cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfdraft cpdfspot \ cpdfpagelabels cpdfcreate cpdfannot cpdfcommand SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml diff --git a/cpdf.ml b/cpdf.ml index 32ee20f..696d6dc 100644 --- a/cpdf.ml +++ b/cpdf.ml @@ -1421,167 +1421,6 @@ let thinlines range width pdf = in Cpdfpage.process_pages (ppstub thinpage) pdf range - -(* \section{Making draft documents} *) - -(* Predicate on an xobject: true if an image xobject. *) -let isimage pdf (_, xobj) = - match Pdf.lookup_direct pdf "/Subtype" xobj with - | Some (Pdf.Name "/Image") -> true - | _ -> false - -(* Given a set of resources for a page, and the name of a resource, determine if -that name refers to an image xobject. 
*) -let xobject_isimage pdf resources name = - match resources with - | Pdf.Dictionary _ -> - begin match Pdf.lookup_direct pdf "/XObject" resources with - | Some xobjects -> - isimage pdf ("", Pdf.lookup_fail "xobject not there" pdf name xobjects) - | _ -> false - end - | _ -> failwith "bad resources" - -(* The subsitute for an image. *) -let substitute boxes = - if boxes then - rev - [Pdfops.Op_q; - Pdfops.Op_w 0.; - Pdfops.Op_G 0.; - Pdfops.Op_re (0., 0., 1., 1.); - Pdfops.Op_m (0., 0.); - Pdfops.Op_l (1., 1.); - Pdfops.Op_m (0., 1.); - Pdfops.Op_l (1., 0.); - Pdfops.Op_S; - Pdfops.Op_Q] - else - [] - -(* Remove references to images from a graphics stream. *) -let rec remove_images_stream onlyremove boxes pdf resources prev = function - | [] -> rev prev - | (Pdfops.Op_Do name) as h::t -> - if xobject_isimage pdf resources name && (match onlyremove with None -> true | Some x -> x = name) - then remove_images_stream onlyremove boxes pdf resources (substitute boxes @ prev) t - else remove_images_stream onlyremove boxes pdf resources (h::prev) t - | Pdfops.InlineImage _ as h::t -> - if onlyremove <> None - then remove_images_stream onlyremove boxes pdf resources (h::prev) t - else remove_images_stream onlyremove boxes pdf resources (substitute boxes @ prev) t - | h::t -> - remove_images_stream onlyremove boxes pdf resources (h::prev) t - -let rec process_form_xobject onlyremove boxes pdf form = - let form = Pdf.direct pdf form in - let page = - {Pdfpage.content = [form]; - Pdfpage.mediabox = Pdf.Null; - Pdfpage.resources = - begin match Pdf.lookup_direct pdf "/Resources" form with - | Some r -> r - | None -> Pdf.Dictionary [] - end; - Pdfpage.rotate = Pdfpage.Rotate0; - Pdfpage.rest = Pdf.Dictionary []} - in - let page', pdf = - remove_images_page onlyremove boxes pdf page - in - let form' = - match form with - | Pdf.Stream {contents = (dict, _)} -> - begin match - Pdfops.stream_of_ops - (Pdfops.parse_operators pdf (Pdf.Dictionary []) page'.Pdfpage.content) - with - 
| Pdf.Stream {contents = (_, Pdf.Got data)} -> - let dict' = - Pdf.add_dict_entry dict "/Length" (Pdf.Integer (bytes_size data)) - in - Pdf.Stream {contents = (dict', Pdf.Got data)} - | _ -> assert false - end - | _ -> raise (Pdf.PDFError "not a stream") - in - form', pdf - -(* Remove images from a page. *) -and remove_images_page onlyremove boxes pdf page = - let isform pdf xobj = - match Pdf.lookup_direct pdf "/Subtype" xobj with Some (Pdf.Name "/Form") -> true | _ -> false - in - let isimage pdf xobj = - match Pdf.lookup_direct pdf "/Subtype" xobj with Some (Pdf.Name "/Image") -> true | _ -> false - in - (* Remove image xobjects and look into form ones *) - let form_xobjects, image_xobjects = - match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with - | Some (Pdf.Dictionary elts) -> - keep (function (_, p) -> isform pdf p) elts, - keep (function (_, p) -> isimage pdf p) elts - | _ -> [], [] - in - let resources', pdf = - let names, pointers = split form_xobjects in - let form_xobjects', pdf = - let pdf = ref pdf - in let outputs = ref [] in - iter - (fun p -> - let p', pdf' = process_form_xobject onlyremove boxes !pdf p in - pdf := pdf'; - outputs =| p') - pointers; - rev !outputs, !pdf - in - let nums = ref [] in - iter - (fun xobj -> - let objnum = Pdf.addobj pdf xobj in - nums =| objnum) - form_xobjects'; - let image_xobjects' = - match onlyremove with - None -> [] - | Some n -> option_map (function (n', _) as xobj -> if n = n' then None else Some xobj) image_xobjects - in - let newdict = - Pdf.Dictionary (image_xobjects' @ combine names (map (fun x -> Pdf.Indirect x) (rev !nums))) - in - Pdf.add_dict_entry page.Pdfpage.resources "/XObject" newdict, pdf - in - let content' = - remove_images_stream onlyremove boxes pdf page.Pdfpage.resources [] - (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content) - in - {page with - Pdfpage.content = - (let stream = Pdfops.stream_of_ops content' in - Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate 
stream; - [stream]); - Pdfpage.resources = resources'}, pdf - -(* Remove images from all pages in a document. *) -let draft onlyremove boxes range pdf = - let pages = Pdfpage.pages_of_pagetree pdf in - let pagenums = indx pages in - let pdf = ref pdf - in let pages' = ref [] in - iter2 - (fun p pagenum -> - let p', pdf' = - if mem pagenum range - then remove_images_page onlyremove boxes !pdf p - else p, !pdf - in - pdf := pdf'; - pages' =| p') - pages - pagenums; - Pdfpage.change_pages true !pdf (rev !pages') - (* Parse the new content to make sure syntactically ok, append * as required. Rewrite the content *) let append_page_content_page fast s before pdf n page = @@ -1775,154 +1614,7 @@ let remove_clipping pdf range = {page with Pdfpage.content = content'} in Cpdfpage.process_pages (ppstub remove_clipping_page) pdf range - -(* Image resolution *) -type xobj = - | Image of int * int (* width, height *) - | Form of Pdftransform.transform_matrix * Pdf.pdfobject * Pdf.pdfobject (* Will add actual data later. *) - -let image_results = ref [] - -let add_image_result i = - image_results := i::!image_results - -(* Given a page and a list of (pagenum, name, thing) *) -let rec image_resolution_page pdf page pagenum dpi (images : (int * string * xobj) list) = - try - let pageops = Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content - and transform = ref [ref Pdftransform.i_matrix] in - iter - (function - | Pdfops.Op_cm matrix -> - begin match !transform with - | [] -> raise (Failure "no transform") - | _ -> (hd !transform) := Pdftransform.matrix_compose !(hd !transform) matrix - end - | Pdfops.Op_Do xobject -> - let trans (x, y) = - match !transform with - | [] -> raise (Failure "no transform") - | _ -> Pdftransform.transform_matrix !(hd !transform) (x, y) - in - let o = trans (0., 0.) - and x = trans (1., 0.) - and y = trans (0., 1.) 
- in - (*i Printf.printf "o = %f, %f, x = %f, %f, y = %f, %f\n" (fst o) (snd o) (fst x) (snd x) (fst y) (snd y); i*) - let rec lookup_image k = function - | [] -> assert false - | (_, a, _) as h::_ when a = k -> h - | _::t -> lookup_image k t - in - begin match lookup_image xobject images with - | (pagenum, name, Form (xobj_matrix, content, resources)) -> - let content = - (* Add in matrix etc. *) - let total_matrix = Pdftransform.matrix_compose xobj_matrix !(hd !transform) in - let ops = - Pdfops.Op_cm total_matrix:: - Pdfops.parse_operators pdf resources [content] - in - Pdfops.stream_of_ops ops - in - let page = - {Pdfpage.content = [content]; - Pdfpage.mediabox = Pdfpage.rectangle_of_paper Pdfpaper.a4; - Pdfpage.resources = resources; - Pdfpage.rotate = Pdfpage.Rotate0; - Pdfpage.rest = Pdf.Dictionary []} - in - let newpdf = Pdfpage.change_pages false pdf [page] in - image_resolution newpdf [pagenum] dpi - | (pagenum, name, Image (w, h)) -> - let lx = Pdfunits.convert 0. Pdfunits.PdfPoint Pdfunits.Inch (distance_between o x) - and ly = Pdfunits.convert 0. Pdfunits.PdfPoint Pdfunits.Inch (distance_between o y) in - let wdpi = float w /. lx - and hdpi = float h /. ly in - add_image_result (pagenum, xobject, w, h, wdpi, hdpi) - (*Printf.printf "%i, %s, %i, %i, %f, %f\n" pagenum xobject w h wdpi hdpi*) - (*i else - Printf.printf "S %i, %s, %i, %i, %f, %f\n" pagenum xobject (int_of_float w) (int_of_float h) wdpi hdpi i*) - end - | Pdfops.Op_q -> - begin match !transform with - | [] -> raise (Failure "Unbalanced q/Q ops") - | h::t -> - let h' = ref Pdftransform.i_matrix in - h' := !h; - transform := h'::h::t - end - | Pdfops.Op_Q -> - begin match !transform with - | [] -> raise (Failure "Unbalanced q/Q ops") - | _ -> transform := tl !transform - end - | _ -> ()) - pageops - with - e -> Printf.printf "Error %s\n" (Printexc.to_string e); flprint "\n" - -and image_resolution pdf range dpi = - let images = ref [] in - Cpdfpage.iter_pages - (fun pagenum page -> - (* 1. 
Get all image names and their native resolutions from resources as string * int * int *) - match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with - | Some (Pdf.Dictionary xobjects) -> - iter - (function (name, xobject) -> - match Pdf.lookup_direct pdf "/Subtype" xobject with - | Some (Pdf.Name "/Image") -> - let width = - match Pdf.lookup_direct pdf "/Width" xobject with - | Some x -> Pdf.getnum x - | None -> 1. - and height = - match Pdf.lookup_direct pdf "/Height" xobject with - | Some x -> Pdf.getnum x - | None -> 1. - in - images := (pagenum, name, Image (int_of_float width, int_of_float height))::!images - | Some (Pdf.Name "/Form") -> - let resources = - match Pdf.lookup_direct pdf "/Resources" xobject with - | None -> page.Pdfpage.resources (* Inherit from page or form above. *) - | Some r -> r - and contents = - xobject - and matrix = - match Pdf.lookup_direct pdf "/Matrix" xobject with - | Some (Pdf.Array [a; b; c; d; e; f]) -> - {Pdftransform.a = Pdf.getnum a; Pdftransform.b = Pdf.getnum b; Pdftransform.c = Pdf.getnum c; - Pdftransform.d = Pdf.getnum d; Pdftransform.e = Pdf.getnum e; Pdftransform.f = Pdf.getnum f} - | _ -> Pdftransform.i_matrix - in - images := (pagenum, name, Form (matrix, contents, resources))::!images - | _ -> () - ) - xobjects - | _ -> ()) - pdf - range; - (* Now, split into differing pages, and call [image_resolution_page] on each one *) - let pagesplits = - map - (function (a, _, _)::_ as ls -> (a, ls) | _ -> assert false) - (collate (fun (a, _, _) (b, _, _) -> compare a b) (rev !images)) - and pages = - Pdfpage.pages_of_pagetree pdf - in - iter - (function (pagenum, images) -> - let page = select pagenum pages in - image_resolution_page pdf page pagenum dpi images) - pagesplits - -let image_resolution pdf range dpi = - image_results := []; - image_resolution pdf range dpi; - rev !image_results - + (* copy the contents of the box f to the box t. 
If mediabox_if_missing is set, the contents of the mediabox will be used if the from fox is not available. If mediabox_is_missing is false, the page is unaltered. *) @@ -1964,13 +1656,3 @@ let remove_unused_resources_page pdf n page = let remove_unused_resources pdf = Cpdfpage.process_pages (ppstub (remove_unused_resources_page pdf)) pdf (ilist 1 (Pdfpage.endpage pdf)) - - -let create_pdf pages pagesize = - let page = - {(Pdfpage.blankpage pagesize) with - Pdfpage.content = [Pdfops.stream_of_ops []]; - Pdfpage.resources = Pdf.Dictionary []} - in - let pdf, pageroot = Pdfpage.add_pagetree (many page pages) (Pdf.empty ()) in - Pdfpage.add_root pageroot [] pdf diff --git a/cpdf.mli b/cpdf.mli index 42241ff..ef5af3a 100644 --- a/cpdf.mli +++ b/cpdf.mli @@ -113,9 +113,6 @@ val blacklines : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t (** Make all fills on certain pages black. *) val blackfills : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t -(** Remove images from a PDF, optionally adding crossed boxes. 
*) -val draft : string option -> bool -> int list -> Pdf.t -> Pdf.t - val remove_all_text : int list -> Pdf.t -> Pdf.t (**/**) @@ -134,13 +131,6 @@ val print_dict_entry : Pdf.t -> string -> unit val remove_clipping : Pdf.t -> int list -> Pdf.t -val image_resolution : Pdf.t -> int list -> float -> (int * string * int * int * float * float) list - val copy_box : string -> string -> bool -> Pdf.t -> int list -> Pdf.t - val remove_unused_resources : Pdf.t -> Pdf.t - - - -val create_pdf : int -> Pdfpaper.t -> Pdf.t diff --git a/cpdfcommand.ml b/cpdfcommand.ml index 40639b4..cc919dc 100644 --- a/cpdfcommand.ml +++ b/cpdfcommand.ml @@ -3649,7 +3649,7 @@ let go () = | Some Draft -> let pdf = get_single_pdf args.op false in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in - write_pdf false (Cpdf.draft args.removeonly args.boxes range pdf) + write_pdf false (Cpdfdraft.draft args.removeonly args.boxes range pdf) | Some (AddText text) -> let pdf = get_single_pdf args.op false in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in @@ -3778,7 +3778,7 @@ let go () = | Some (ImageResolution f) -> let pdf = get_single_pdf args.op true in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in - let images = Cpdf.image_resolution pdf range f in + let images = Cpdfimage.image_resolution pdf range f in iter (function (pagenum, xobject, w, h, wdpi, hdpi) -> if wdpi < f || hdpi < f then @@ -3854,7 +3854,7 @@ let go () = let pdf = get_single_pdf args.op false in write_pdf false (Cpdfbookmarks.bookmarks_open_to_level n pdf) | Some CreatePDF -> - let pdf = Cpdf.create_pdf args.createpdf_pages args.createpdf_pagesize in + let pdf = Cpdfcreate.blank_document_paper args.createpdf_pagesize args.createpdf_pages in write_pdf false pdf | Some RemoveAllText -> let pdf = get_single_pdf args.op false in diff --git a/cpdfdraft.ml b/cpdfdraft.ml new file mode 100644 index 0000000..bf8b7fc --- /dev/null +++ b/cpdfdraft.ml @@ -0,0 +1,163 @@ +open Pdfutil +open Pdfio + 
+(* \section{Making draft documents} *) + +(* Predicate on an xobject: true if an image xobject. *) +let isimage pdf (_, xobj) = + match Pdf.lookup_direct pdf "/Subtype" xobj with + | Some (Pdf.Name "/Image") -> true + | _ -> false + +(* Given a set of resources for a page, and the name of a resource, determine if +that name refers to an image xobject. *) +let xobject_isimage pdf resources name = + match resources with + | Pdf.Dictionary _ -> + begin match Pdf.lookup_direct pdf "/XObject" resources with + | Some xobjects -> + isimage pdf ("", Pdf.lookup_fail "xobject not there" pdf name xobjects) + | _ -> false + end + | _ -> failwith "bad resources" + +(* The substitute for an image. *) +let substitute boxes = + if boxes then + rev + [Pdfops.Op_q; + Pdfops.Op_w 0.; + Pdfops.Op_G 0.; + Pdfops.Op_re (0., 0., 1., 1.); + Pdfops.Op_m (0., 0.); + Pdfops.Op_l (1., 1.); + Pdfops.Op_m (0., 1.); + Pdfops.Op_l (1., 0.); + Pdfops.Op_S; + Pdfops.Op_Q] + else + [] + +(* Remove references to images from a graphics stream. 
*) +let rec remove_images_stream onlyremove boxes pdf resources prev = function + | [] -> rev prev + | (Pdfops.Op_Do name) as h::t -> + if xobject_isimage pdf resources name && (match onlyremove with None -> true | Some x -> x = name) + then remove_images_stream onlyremove boxes pdf resources (substitute boxes @ prev) t + else remove_images_stream onlyremove boxes pdf resources (h::prev) t + | Pdfops.InlineImage _ as h::t -> + if onlyremove <> None + then remove_images_stream onlyremove boxes pdf resources (h::prev) t + else remove_images_stream onlyremove boxes pdf resources (substitute boxes @ prev) t + | h::t -> + remove_images_stream onlyremove boxes pdf resources (h::prev) t + +let rec process_form_xobject onlyremove boxes pdf form = + let form = Pdf.direct pdf form in + let page = + {Pdfpage.content = [form]; + Pdfpage.mediabox = Pdf.Null; + Pdfpage.resources = + begin match Pdf.lookup_direct pdf "/Resources" form with + | Some r -> r + | None -> Pdf.Dictionary [] + end; + Pdfpage.rotate = Pdfpage.Rotate0; + Pdfpage.rest = Pdf.Dictionary []} + in + let page', pdf = + remove_images_page onlyremove boxes pdf page + in + let form' = + match form with + | Pdf.Stream {contents = (dict, _)} -> + begin match + Pdfops.stream_of_ops + (Pdfops.parse_operators pdf (Pdf.Dictionary []) page'.Pdfpage.content) + with + | Pdf.Stream {contents = (_, Pdf.Got data)} -> + let dict' = + Pdf.add_dict_entry dict "/Length" (Pdf.Integer (bytes_size data)) + in + Pdf.Stream {contents = (dict', Pdf.Got data)} + | _ -> assert false + end + | _ -> raise (Pdf.PDFError "not a stream") + in + form', pdf + +(* Remove images from a page. 
*) +and remove_images_page onlyremove boxes pdf page = + let isform pdf xobj = + match Pdf.lookup_direct pdf "/Subtype" xobj with Some (Pdf.Name "/Form") -> true | _ -> false + in + let isimage pdf xobj = + match Pdf.lookup_direct pdf "/Subtype" xobj with Some (Pdf.Name "/Image") -> true | _ -> false + in + (* Remove image xobjects and look into form ones *) + let form_xobjects, image_xobjects = + match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with + | Some (Pdf.Dictionary elts) -> + keep (function (_, p) -> isform pdf p) elts, + keep (function (_, p) -> isimage pdf p) elts + | _ -> [], [] + in + let resources', pdf = + let names, pointers = split form_xobjects in + let form_xobjects', pdf = + let pdf = ref pdf + in let outputs = ref [] in + iter + (fun p -> + let p', pdf' = process_form_xobject onlyremove boxes !pdf p in + pdf := pdf'; + outputs =| p') + pointers; + rev !outputs, !pdf + in + let nums = ref [] in + iter + (fun xobj -> + let objnum = Pdf.addobj pdf xobj in + nums =| objnum) + form_xobjects'; + let image_xobjects' = + match onlyremove with + None -> [] + | Some n -> option_map (function (n', _) as xobj -> if n = n' then None else Some xobj) image_xobjects + in + let newdict = + Pdf.Dictionary (image_xobjects' @ combine names (map (fun x -> Pdf.Indirect x) (rev !nums))) + in + Pdf.add_dict_entry page.Pdfpage.resources "/XObject" newdict, pdf + in + let content' = + remove_images_stream onlyremove boxes pdf page.Pdfpage.resources [] + (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content) + in + {page with + Pdfpage.content = + (let stream = Pdfops.stream_of_ops content' in + Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream; + [stream]); + Pdfpage.resources = resources'}, pdf + +(* Remove images from all pages in a document. 
*) +let draft onlyremove boxes range pdf = + let pages = Pdfpage.pages_of_pagetree pdf in + let pagenums = indx pages in + let pdf = ref pdf + in let pages' = ref [] in + iter2 + (fun p pagenum -> + let p', pdf' = + if mem pagenum range + then remove_images_page onlyremove boxes !pdf p + else p, !pdf + in + pdf := pdf'; + pages' =| p') + pages + pagenums; + Pdfpage.change_pages true !pdf (rev !pages') + diff --git a/cpdfdraft.mli b/cpdfdraft.mli new file mode 100644 index 0000000..f913749 --- /dev/null +++ b/cpdfdraft.mli @@ -0,0 +1,2 @@ +(** Remove images from a PDF, optionally adding crossed boxes. *) +val draft : string option -> bool -> int list -> Pdf.t -> Pdf.t diff --git a/cpdfimage.ml b/cpdfimage.ml index 7dd09bb..8294161 100644 --- a/cpdfimage.ml +++ b/cpdfimage.ml @@ -125,3 +125,151 @@ let extract_images path_to_p2p path_to_im encoding dedup dedup_per_page pdf rang iter (extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms) pages (indx pages) + +(* Image resolution *) +type xobj = + | Image of int * int (* width, height *) + | Form of Pdftransform.transform_matrix * Pdf.pdfobject * Pdf.pdfobject (* Will add actual data later. 
*) + +let image_results = ref [] + +let add_image_result i = + image_results := i::!image_results + +(* Given a page and a list of (pagenum, name, thing) *) +let rec image_resolution_page pdf page pagenum dpi (images : (int * string * xobj) list) = + try + let pageops = Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content + and transform = ref [ref Pdftransform.i_matrix] in + iter + (function + | Pdfops.Op_cm matrix -> + begin match !transform with + | [] -> raise (Failure "no transform") + | _ -> (hd !transform) := Pdftransform.matrix_compose !(hd !transform) matrix + end + | Pdfops.Op_Do xobject -> + let trans (x, y) = + match !transform with + | [] -> raise (Failure "no transform") + | _ -> Pdftransform.transform_matrix !(hd !transform) (x, y) + in + let o = trans (0., 0.) + and x = trans (1., 0.) + and y = trans (0., 1.) + in + (*i Printf.printf "o = %f, %f, x = %f, %f, y = %f, %f\n" (fst o) (snd o) (fst x) (snd x) (fst y) (snd y); i*) + let rec lookup_image k = function + | [] -> assert false + | (_, a, _) as h::_ when a = k -> h + | _::t -> lookup_image k t + in + begin match lookup_image xobject images with + | (pagenum, name, Form (xobj_matrix, content, resources)) -> + let content = + (* Add in matrix etc. *) + let total_matrix = Pdftransform.matrix_compose xobj_matrix !(hd !transform) in + let ops = + Pdfops.Op_cm total_matrix:: + Pdfops.parse_operators pdf resources [content] + in + Pdfops.stream_of_ops ops + in + let page = + {Pdfpage.content = [content]; + Pdfpage.mediabox = Pdfpage.rectangle_of_paper Pdfpaper.a4; + Pdfpage.resources = resources; + Pdfpage.rotate = Pdfpage.Rotate0; + Pdfpage.rest = Pdf.Dictionary []} + in + let newpdf = Pdfpage.change_pages false pdf [page] in + image_resolution newpdf [pagenum] dpi + | (pagenum, name, Image (w, h)) -> + let lx = Pdfunits.convert 0. Pdfunits.PdfPoint Pdfunits.Inch (distance_between o x) + and ly = Pdfunits.convert 0. 
Pdfunits.PdfPoint Pdfunits.Inch (distance_between o y) in + let wdpi = float w /. lx + and hdpi = float h /. ly in + add_image_result (pagenum, xobject, w, h, wdpi, hdpi) + (*Printf.printf "%i, %s, %i, %i, %f, %f\n" pagenum xobject w h wdpi hdpi*) + (*i else + Printf.printf "S %i, %s, %i, %i, %f, %f\n" pagenum xobject (int_of_float w) (int_of_float h) wdpi hdpi i*) + end + | Pdfops.Op_q -> + begin match !transform with + | [] -> raise (Failure "Unbalanced q/Q ops") + | h::t -> + let h' = ref Pdftransform.i_matrix in + h' := !h; + transform := h'::h::t + end + | Pdfops.Op_Q -> + begin match !transform with + | [] -> raise (Failure "Unbalanced q/Q ops") + | _ -> transform := tl !transform + end + | _ -> ()) + pageops + with + e -> Printf.printf "Error %s\n" (Printexc.to_string e); flprint "\n" + +and image_resolution pdf range dpi = + let images = ref [] in + Cpdfpage.iter_pages + (fun pagenum page -> + (* 1. Get all image names and their native resolutions from resources as string * int * int *) + match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with + | Some (Pdf.Dictionary xobjects) -> + iter + (function (name, xobject) -> + match Pdf.lookup_direct pdf "/Subtype" xobject with + | Some (Pdf.Name "/Image") -> + let width = + match Pdf.lookup_direct pdf "/Width" xobject with + | Some x -> Pdf.getnum x + | None -> 1. + and height = + match Pdf.lookup_direct pdf "/Height" xobject with + | Some x -> Pdf.getnum x + | None -> 1. + in + images := (pagenum, name, Image (int_of_float width, int_of_float height))::!images + | Some (Pdf.Name "/Form") -> + let resources = + match Pdf.lookup_direct pdf "/Resources" xobject with + | None -> page.Pdfpage.resources (* Inherit from page or form above. 
*) + | Some r -> r + and contents = + xobject + and matrix = + match Pdf.lookup_direct pdf "/Matrix" xobject with + | Some (Pdf.Array [a; b; c; d; e; f]) -> + {Pdftransform.a = Pdf.getnum a; Pdftransform.b = Pdf.getnum b; Pdftransform.c = Pdf.getnum c; + Pdftransform.d = Pdf.getnum d; Pdftransform.e = Pdf.getnum e; Pdftransform.f = Pdf.getnum f} + | _ -> Pdftransform.i_matrix + in + images := (pagenum, name, Form (matrix, contents, resources))::!images + | _ -> () + ) + xobjects + | _ -> ()) + pdf + range; + (* Now, split into differing pages, and call [image_resolution_page] on each one *) + let pagesplits = + map + (function (a, _, _)::_ as ls -> (a, ls) | _ -> assert false) + (collate (fun (a, _, _) (b, _, _) -> compare a b) (rev !images)) + and pages = + Pdfpage.pages_of_pagetree pdf + in + iter + (function (pagenum, images) -> + let page = select pagenum pages in + image_resolution_page pdf page pagenum dpi images) + pagesplits + +let image_resolution pdf range dpi = + image_results := []; + image_resolution pdf range dpi; + rev !image_results + diff --git a/cpdfimage.mli b/cpdfimage.mli index 9e39778..5383dcb 100644 --- a/cpdfimage.mli +++ b/cpdfimage.mli @@ -1,3 +1,5 @@ val extract_images : string -> string -> Cpdfmetadata.encoding -> bool -> bool -> Pdf.t -> int list -> string -> unit + +val image_resolution : Pdf.t -> int list -> float -> (int * string * int * int * float * float) list