This commit is contained in:
John Whitington 2021-12-21 15:25:59 +00:00
parent c711e3aa77
commit 9bdeccb343
8 changed files with 320 additions and 333 deletions

View File

@ -3,7 +3,7 @@ MODS = cpdfyojson cpdfxmlm \
cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \ cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \
cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \ cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \
cpdfbookmarks cpdfpage cpdfaddtext cpdf cpdfimage cpdffont cpdftype \ cpdfbookmarks cpdfpage cpdfaddtext cpdf cpdfimage cpdffont cpdftype \
cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfspot \ cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfdraft cpdfspot \
cpdfpagelabels cpdfcreate cpdfannot cpdfcommand cpdfpagelabels cpdfcreate cpdfannot cpdfcommand
SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml

320
cpdf.ml
View File

@ -1421,167 +1421,6 @@ let thinlines range width pdf =
in in
Cpdfpage.process_pages (ppstub thinpage) pdf range Cpdfpage.process_pages (ppstub thinpage) pdf range
(* \section{Making draft documents} *)
(* Predicate on an xobject: true if an image xobject. *)
let isimage pdf (_, xobj) =
match Pdf.lookup_direct pdf "/Subtype" xobj with
| Some (Pdf.Name "/Image") -> true
| _ -> false
(* Given a set of resources for a page, and the name of a resource, determine if
that name refers to an image xobject. *)
let xobject_isimage pdf resources name =
match resources with
| Pdf.Dictionary _ ->
begin match Pdf.lookup_direct pdf "/XObject" resources with
| Some xobjects ->
isimage pdf ("", Pdf.lookup_fail "xobject not there" pdf name xobjects)
| _ -> false
end
| _ -> failwith "bad resources"
(* The subsitute for an image. *)
let substitute boxes =
if boxes then
rev
[Pdfops.Op_q;
Pdfops.Op_w 0.;
Pdfops.Op_G 0.;
Pdfops.Op_re (0., 0., 1., 1.);
Pdfops.Op_m (0., 0.);
Pdfops.Op_l (1., 1.);
Pdfops.Op_m (0., 1.);
Pdfops.Op_l (1., 0.);
Pdfops.Op_S;
Pdfops.Op_Q]
else
[]
(* Remove references to images from a graphics stream. *)
let rec remove_images_stream onlyremove boxes pdf resources prev = function
| [] -> rev prev
| (Pdfops.Op_Do name) as h::t ->
if xobject_isimage pdf resources name && (match onlyremove with None -> true | Some x -> x = name)
then remove_images_stream onlyremove boxes pdf resources (substitute boxes @ prev) t
else remove_images_stream onlyremove boxes pdf resources (h::prev) t
| Pdfops.InlineImage _ as h::t ->
if onlyremove <> None
then remove_images_stream onlyremove boxes pdf resources (h::prev) t
else remove_images_stream onlyremove boxes pdf resources (substitute boxes @ prev) t
| h::t ->
remove_images_stream onlyremove boxes pdf resources (h::prev) t
let rec process_form_xobject onlyremove boxes pdf form =
let form = Pdf.direct pdf form in
let page =
{Pdfpage.content = [form];
Pdfpage.mediabox = Pdf.Null;
Pdfpage.resources =
begin match Pdf.lookup_direct pdf "/Resources" form with
| Some r -> r
| None -> Pdf.Dictionary []
end;
Pdfpage.rotate = Pdfpage.Rotate0;
Pdfpage.rest = Pdf.Dictionary []}
in
let page', pdf =
remove_images_page onlyremove boxes pdf page
in
let form' =
match form with
| Pdf.Stream {contents = (dict, _)} ->
begin match
Pdfops.stream_of_ops
(Pdfops.parse_operators pdf (Pdf.Dictionary []) page'.Pdfpage.content)
with
| Pdf.Stream {contents = (_, Pdf.Got data)} ->
let dict' =
Pdf.add_dict_entry dict "/Length" (Pdf.Integer (bytes_size data))
in
Pdf.Stream {contents = (dict', Pdf.Got data)}
| _ -> assert false
end
| _ -> raise (Pdf.PDFError "not a stream")
in
form', pdf
(* Remove images from a page. *)
and remove_images_page onlyremove boxes pdf page =
let isform pdf xobj =
match Pdf.lookup_direct pdf "/Subtype" xobj with Some (Pdf.Name "/Form") -> true | _ -> false
in
let isimage pdf xobj =
match Pdf.lookup_direct pdf "/Subtype" xobj with Some (Pdf.Name "/Image") -> true | _ -> false
in
(* Remove image xobjects and look into form ones *)
let form_xobjects, image_xobjects =
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary elts) ->
keep (function (_, p) -> isform pdf p) elts,
keep (function (_, p) -> isimage pdf p) elts
| _ -> [], []
in
let resources', pdf =
let names, pointers = split form_xobjects in
let form_xobjects', pdf =
let pdf = ref pdf
in let outputs = ref [] in
iter
(fun p ->
let p', pdf' = process_form_xobject onlyremove boxes !pdf p in
pdf := pdf';
outputs =| p')
pointers;
rev !outputs, !pdf
in
let nums = ref [] in
iter
(fun xobj ->
let objnum = Pdf.addobj pdf xobj in
nums =| objnum)
form_xobjects';
let image_xobjects' =
match onlyremove with
None -> []
| Some n -> option_map (function (n', _) as xobj -> if n = n' then None else Some xobj) image_xobjects
in
let newdict =
Pdf.Dictionary (image_xobjects' @ combine names (map (fun x -> Pdf.Indirect x) (rev !nums)))
in
Pdf.add_dict_entry page.Pdfpage.resources "/XObject" newdict, pdf
in
let content' =
remove_images_stream onlyremove boxes pdf page.Pdfpage.resources []
(Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content)
in
{page with
Pdfpage.content =
(let stream = Pdfops.stream_of_ops content' in
Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream;
[stream]);
Pdfpage.resources = resources'}, pdf
(* Remove images from all pages in a document. *)
let draft onlyremove boxes range pdf =
let pages = Pdfpage.pages_of_pagetree pdf in
let pagenums = indx pages in
let pdf = ref pdf
in let pages' = ref [] in
iter2
(fun p pagenum ->
let p', pdf' =
if mem pagenum range
then remove_images_page onlyremove boxes !pdf p
else p, !pdf
in
pdf := pdf';
pages' =| p')
pages
pagenums;
Pdfpage.change_pages true !pdf (rev !pages')
(* Parse the new content to make sure syntactically ok, append (* Parse the new content to make sure syntactically ok, append
* as required. Rewrite the content *) * as required. Rewrite the content *)
let append_page_content_page fast s before pdf n page = let append_page_content_page fast s before pdf n page =
@ -1775,154 +1614,7 @@ let remove_clipping pdf range =
{page with Pdfpage.content = content'} {page with Pdfpage.content = content'}
in in
Cpdfpage.process_pages (ppstub remove_clipping_page) pdf range Cpdfpage.process_pages (ppstub remove_clipping_page) pdf range
(* Image resolution *)
type xobj =
| Image of int * int (* width, height *)
| Form of Pdftransform.transform_matrix * Pdf.pdfobject * Pdf.pdfobject (* Will add actual data later. *)
let image_results = ref []
let add_image_result i =
image_results := i::!image_results
(* Given a page and a list of (pagenum, name, thing) *)
let rec image_resolution_page pdf page pagenum dpi (images : (int * string * xobj) list) =
try
let pageops = Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
and transform = ref [ref Pdftransform.i_matrix] in
iter
(function
| Pdfops.Op_cm matrix ->
begin match !transform with
| [] -> raise (Failure "no transform")
| _ -> (hd !transform) := Pdftransform.matrix_compose !(hd !transform) matrix
end
| Pdfops.Op_Do xobject ->
let trans (x, y) =
match !transform with
| [] -> raise (Failure "no transform")
| _ -> Pdftransform.transform_matrix !(hd !transform) (x, y)
in
let o = trans (0., 0.)
and x = trans (1., 0.)
and y = trans (0., 1.)
in
(*i Printf.printf "o = %f, %f, x = %f, %f, y = %f, %f\n" (fst o) (snd o) (fst x) (snd x) (fst y) (snd y); i*)
let rec lookup_image k = function
| [] -> assert false
| (_, a, _) as h::_ when a = k -> h
| _::t -> lookup_image k t
in
begin match lookup_image xobject images with
| (pagenum, name, Form (xobj_matrix, content, resources)) ->
let content =
(* Add in matrix etc. *)
let total_matrix = Pdftransform.matrix_compose xobj_matrix !(hd !transform) in
let ops =
Pdfops.Op_cm total_matrix::
Pdfops.parse_operators pdf resources [content]
in
Pdfops.stream_of_ops ops
in
let page =
{Pdfpage.content = [content];
Pdfpage.mediabox = Pdfpage.rectangle_of_paper Pdfpaper.a4;
Pdfpage.resources = resources;
Pdfpage.rotate = Pdfpage.Rotate0;
Pdfpage.rest = Pdf.Dictionary []}
in
let newpdf = Pdfpage.change_pages false pdf [page] in
image_resolution newpdf [pagenum] dpi
| (pagenum, name, Image (w, h)) ->
let lx = Pdfunits.convert 0. Pdfunits.PdfPoint Pdfunits.Inch (distance_between o x)
and ly = Pdfunits.convert 0. Pdfunits.PdfPoint Pdfunits.Inch (distance_between o y) in
let wdpi = float w /. lx
and hdpi = float h /. ly in
add_image_result (pagenum, xobject, w, h, wdpi, hdpi)
(*Printf.printf "%i, %s, %i, %i, %f, %f\n" pagenum xobject w h wdpi hdpi*)
(*i else
Printf.printf "S %i, %s, %i, %i, %f, %f\n" pagenum xobject (int_of_float w) (int_of_float h) wdpi hdpi i*)
end
| Pdfops.Op_q ->
begin match !transform with
| [] -> raise (Failure "Unbalanced q/Q ops")
| h::t ->
let h' = ref Pdftransform.i_matrix in
h' := !h;
transform := h'::h::t
end
| Pdfops.Op_Q ->
begin match !transform with
| [] -> raise (Failure "Unbalanced q/Q ops")
| _ -> transform := tl !transform
end
| _ -> ())
pageops
with
e -> Printf.printf "Error %s\n" (Printexc.to_string e); flprint "\n"
and image_resolution pdf range dpi =
let images = ref [] in
Cpdfpage.iter_pages
(fun pagenum page ->
(* 1. Get all image names and their native resolutions from resources as string * int * int *)
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary xobjects) ->
iter
(function (name, xobject) ->
match Pdf.lookup_direct pdf "/Subtype" xobject with
| Some (Pdf.Name "/Image") ->
let width =
match Pdf.lookup_direct pdf "/Width" xobject with
| Some x -> Pdf.getnum x
| None -> 1.
and height =
match Pdf.lookup_direct pdf "/Height" xobject with
| Some x -> Pdf.getnum x
| None -> 1.
in
images := (pagenum, name, Image (int_of_float width, int_of_float height))::!images
| Some (Pdf.Name "/Form") ->
let resources =
match Pdf.lookup_direct pdf "/Resources" xobject with
| None -> page.Pdfpage.resources (* Inherit from page or form above. *)
| Some r -> r
and contents =
xobject
and matrix =
match Pdf.lookup_direct pdf "/Matrix" xobject with
| Some (Pdf.Array [a; b; c; d; e; f]) ->
{Pdftransform.a = Pdf.getnum a; Pdftransform.b = Pdf.getnum b; Pdftransform.c = Pdf.getnum c;
Pdftransform.d = Pdf.getnum d; Pdftransform.e = Pdf.getnum e; Pdftransform.f = Pdf.getnum f}
| _ -> Pdftransform.i_matrix
in
images := (pagenum, name, Form (matrix, contents, resources))::!images
| _ -> ()
)
xobjects
| _ -> ())
pdf
range;
(* Now, split into differing pages, and call [image_resolution_page] on each one *)
let pagesplits =
map
(function (a, _, _)::_ as ls -> (a, ls) | _ -> assert false)
(collate (fun (a, _, _) (b, _, _) -> compare a b) (rev !images))
and pages =
Pdfpage.pages_of_pagetree pdf
in
iter
(function (pagenum, images) ->
let page = select pagenum pages in
image_resolution_page pdf page pagenum dpi images)
pagesplits
let image_resolution pdf range dpi =
image_results := [];
image_resolution pdf range dpi;
rev !image_results
(* copy the contents of the box f to the box t. If mediabox_if_missing is set, (* copy the contents of the box f to the box t. If mediabox_if_missing is set,
the contents of the mediabox will be used if the from fox is not available. If the contents of the mediabox will be used if the from fox is not available. If
mediabox_is_missing is false, the page is unaltered. *) mediabox_is_missing is false, the page is unaltered. *)
@ -1964,13 +1656,3 @@ let remove_unused_resources_page pdf n page =
let remove_unused_resources pdf = let remove_unused_resources pdf =
Cpdfpage.process_pages (ppstub (remove_unused_resources_page pdf)) pdf (ilist 1 (Pdfpage.endpage pdf)) Cpdfpage.process_pages (ppstub (remove_unused_resources_page pdf)) pdf (ilist 1 (Pdfpage.endpage pdf))
let create_pdf pages pagesize =
let page =
{(Pdfpage.blankpage pagesize) with
Pdfpage.content = [Pdfops.stream_of_ops []];
Pdfpage.resources = Pdf.Dictionary []}
in
let pdf, pageroot = Pdfpage.add_pagetree (many page pages) (Pdf.empty ()) in
Pdfpage.add_root pageroot [] pdf

View File

@ -113,9 +113,6 @@ val blacklines : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t
(** Make all fills on certain pages black. *) (** Make all fills on certain pages black. *)
val blackfills : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t val blackfills : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t
(** Remove images from a PDF, optionally adding crossed boxes. *)
val draft : string option -> bool -> int list -> Pdf.t -> Pdf.t
val remove_all_text : int list -> Pdf.t -> Pdf.t val remove_all_text : int list -> Pdf.t -> Pdf.t
(**/**) (**/**)
@ -134,13 +131,6 @@ val print_dict_entry : Pdf.t -> string -> unit
val remove_clipping : Pdf.t -> int list -> Pdf.t val remove_clipping : Pdf.t -> int list -> Pdf.t
val image_resolution : Pdf.t -> int list -> float -> (int * string * int * int * float * float) list
val copy_box : string -> string -> bool -> Pdf.t -> int list -> Pdf.t val copy_box : string -> string -> bool -> Pdf.t -> int list -> Pdf.t
val remove_unused_resources : Pdf.t -> Pdf.t val remove_unused_resources : Pdf.t -> Pdf.t
val create_pdf : int -> Pdfpaper.t -> Pdf.t

View File

@ -3649,7 +3649,7 @@ let go () =
| Some Draft -> | Some Draft ->
let pdf = get_single_pdf args.op false in let pdf = get_single_pdf args.op false in
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
write_pdf false (Cpdf.draft args.removeonly args.boxes range pdf) write_pdf false (Cpdfdraft.draft args.removeonly args.boxes range pdf)
| Some (AddText text) -> | Some (AddText text) ->
let pdf = get_single_pdf args.op false in let pdf = get_single_pdf args.op false in
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
@ -3778,7 +3778,7 @@ let go () =
| Some (ImageResolution f) -> | Some (ImageResolution f) ->
let pdf = get_single_pdf args.op true in let pdf = get_single_pdf args.op true in
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
let images = Cpdf.image_resolution pdf range f in let images = Cpdfimage.image_resolution pdf range f in
iter iter
(function (pagenum, xobject, w, h, wdpi, hdpi) -> (function (pagenum, xobject, w, h, wdpi, hdpi) ->
if wdpi < f || hdpi < f then if wdpi < f || hdpi < f then
@ -3854,7 +3854,7 @@ let go () =
let pdf = get_single_pdf args.op false in let pdf = get_single_pdf args.op false in
write_pdf false (Cpdfbookmarks.bookmarks_open_to_level n pdf) write_pdf false (Cpdfbookmarks.bookmarks_open_to_level n pdf)
| Some CreatePDF -> | Some CreatePDF ->
let pdf = Cpdf.create_pdf args.createpdf_pages args.createpdf_pagesize in let pdf = Cpdfcreate.blank_document_paper args.createpdf_pagesize args.createpdf_pages in
write_pdf false pdf write_pdf false pdf
| Some RemoveAllText -> | Some RemoveAllText ->
let pdf = get_single_pdf args.op false in let pdf = get_single_pdf args.op false in

163
cpdfdraft.ml Normal file
View File

@ -0,0 +1,163 @@
open Pdfutil
open Pdfio
(* \section{Making draft documents} *)
(* Predicate on an xobject: true if an image xobject. *)
let isimage pdf (_, xobj) =
match Pdf.lookup_direct pdf "/Subtype" xobj with
| Some (Pdf.Name "/Image") -> true
| _ -> false
(* Given a set of resources for a page, and the name of a resource, determine if
that name refers to an image xobject. *)
let xobject_isimage pdf resources name =
match resources with
| Pdf.Dictionary _ ->
begin match Pdf.lookup_direct pdf "/XObject" resources with
| Some xobjects ->
isimage pdf ("", Pdf.lookup_fail "xobject not there" pdf name xobjects)
| _ -> false
end
| _ -> failwith "bad resources"
(* The subsitute for an image. *)
let substitute boxes =
if boxes then
rev
[Pdfops.Op_q;
Pdfops.Op_w 0.;
Pdfops.Op_G 0.;
Pdfops.Op_re (0., 0., 1., 1.);
Pdfops.Op_m (0., 0.);
Pdfops.Op_l (1., 1.);
Pdfops.Op_m (0., 1.);
Pdfops.Op_l (1., 0.);
Pdfops.Op_S;
Pdfops.Op_Q]
else
[]
(* Remove references to images from a graphics stream. *)
let rec remove_images_stream onlyremove boxes pdf resources prev = function
| [] -> rev prev
| (Pdfops.Op_Do name) as h::t ->
if xobject_isimage pdf resources name && (match onlyremove with None -> true | Some x -> x = name)
then remove_images_stream onlyremove boxes pdf resources (substitute boxes @ prev) t
else remove_images_stream onlyremove boxes pdf resources (h::prev) t
| Pdfops.InlineImage _ as h::t ->
if onlyremove <> None
then remove_images_stream onlyremove boxes pdf resources (h::prev) t
else remove_images_stream onlyremove boxes pdf resources (substitute boxes @ prev) t
| h::t ->
remove_images_stream onlyremove boxes pdf resources (h::prev) t
let rec process_form_xobject onlyremove boxes pdf form =
let form = Pdf.direct pdf form in
let page =
{Pdfpage.content = [form];
Pdfpage.mediabox = Pdf.Null;
Pdfpage.resources =
begin match Pdf.lookup_direct pdf "/Resources" form with
| Some r -> r
| None -> Pdf.Dictionary []
end;
Pdfpage.rotate = Pdfpage.Rotate0;
Pdfpage.rest = Pdf.Dictionary []}
in
let page', pdf =
remove_images_page onlyremove boxes pdf page
in
let form' =
match form with
| Pdf.Stream {contents = (dict, _)} ->
begin match
Pdfops.stream_of_ops
(Pdfops.parse_operators pdf (Pdf.Dictionary []) page'.Pdfpage.content)
with
| Pdf.Stream {contents = (_, Pdf.Got data)} ->
let dict' =
Pdf.add_dict_entry dict "/Length" (Pdf.Integer (bytes_size data))
in
Pdf.Stream {contents = (dict', Pdf.Got data)}
| _ -> assert false
end
| _ -> raise (Pdf.PDFError "not a stream")
in
form', pdf
(* Remove images from a page. *)
and remove_images_page onlyremove boxes pdf page =
let isform pdf xobj =
match Pdf.lookup_direct pdf "/Subtype" xobj with Some (Pdf.Name "/Form") -> true | _ -> false
in
let isimage pdf xobj =
match Pdf.lookup_direct pdf "/Subtype" xobj with Some (Pdf.Name "/Image") -> true | _ -> false
in
(* Remove image xobjects and look into form ones *)
let form_xobjects, image_xobjects =
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary elts) ->
keep (function (_, p) -> isform pdf p) elts,
keep (function (_, p) -> isimage pdf p) elts
| _ -> [], []
in
let resources', pdf =
let names, pointers = split form_xobjects in
let form_xobjects', pdf =
let pdf = ref pdf
in let outputs = ref [] in
iter
(fun p ->
let p', pdf' = process_form_xobject onlyremove boxes !pdf p in
pdf := pdf';
outputs =| p')
pointers;
rev !outputs, !pdf
in
let nums = ref [] in
iter
(fun xobj ->
let objnum = Pdf.addobj pdf xobj in
nums =| objnum)
form_xobjects';
let image_xobjects' =
match onlyremove with
None -> []
| Some n -> option_map (function (n', _) as xobj -> if n = n' then None else Some xobj) image_xobjects
in
let newdict =
Pdf.Dictionary (image_xobjects' @ combine names (map (fun x -> Pdf.Indirect x) (rev !nums)))
in
Pdf.add_dict_entry page.Pdfpage.resources "/XObject" newdict, pdf
in
let content' =
remove_images_stream onlyremove boxes pdf page.Pdfpage.resources []
(Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content)
in
{page with
Pdfpage.content =
(let stream = Pdfops.stream_of_ops content' in
Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream;
[stream]);
Pdfpage.resources = resources'}, pdf
(* Remove images from all pages in a document. *)
let draft onlyremove boxes range pdf =
let pages = Pdfpage.pages_of_pagetree pdf in
let pagenums = indx pages in
let pdf = ref pdf
in let pages' = ref [] in
iter2
(fun p pagenum ->
let p', pdf' =
if mem pagenum range
then remove_images_page onlyremove boxes !pdf p
else p, !pdf
in
pdf := pdf';
pages' =| p')
pages
pagenums;
Pdfpage.change_pages true !pdf (rev !pages')

2
cpdfdraft.mli Normal file
View File

@ -0,0 +1,2 @@
(** Remove images from a PDF, optionally adding crossed boxes. *)
val draft : string option -> bool -> int list -> Pdf.t -> Pdf.t

View File

@ -125,3 +125,151 @@ let extract_images path_to_p2p path_to_im encoding dedup dedup_per_page pdf rang
iter (extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms) iter (extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms)
pages pages
(indx pages) (indx pages)
(* Image resolution *)
type xobj =
| Image of int * int (* width, height *)
| Form of Pdftransform.transform_matrix * Pdf.pdfobject * Pdf.pdfobject (* Will add actual data later. *)
let image_results = ref []
let add_image_result i =
image_results := i::!image_results
(* Given a page and a list of (pagenum, name, thing) *)
let rec image_resolution_page pdf page pagenum dpi (images : (int * string * xobj) list) =
try
let pageops = Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
and transform = ref [ref Pdftransform.i_matrix] in
iter
(function
| Pdfops.Op_cm matrix ->
begin match !transform with
| [] -> raise (Failure "no transform")
| _ -> (hd !transform) := Pdftransform.matrix_compose !(hd !transform) matrix
end
| Pdfops.Op_Do xobject ->
let trans (x, y) =
match !transform with
| [] -> raise (Failure "no transform")
| _ -> Pdftransform.transform_matrix !(hd !transform) (x, y)
in
let o = trans (0., 0.)
and x = trans (1., 0.)
and y = trans (0., 1.)
in
(*i Printf.printf "o = %f, %f, x = %f, %f, y = %f, %f\n" (fst o) (snd o) (fst x) (snd x) (fst y) (snd y); i*)
let rec lookup_image k = function
| [] -> assert false
| (_, a, _) as h::_ when a = k -> h
| _::t -> lookup_image k t
in
begin match lookup_image xobject images with
| (pagenum, name, Form (xobj_matrix, content, resources)) ->
let content =
(* Add in matrix etc. *)
let total_matrix = Pdftransform.matrix_compose xobj_matrix !(hd !transform) in
let ops =
Pdfops.Op_cm total_matrix::
Pdfops.parse_operators pdf resources [content]
in
Pdfops.stream_of_ops ops
in
let page =
{Pdfpage.content = [content];
Pdfpage.mediabox = Pdfpage.rectangle_of_paper Pdfpaper.a4;
Pdfpage.resources = resources;
Pdfpage.rotate = Pdfpage.Rotate0;
Pdfpage.rest = Pdf.Dictionary []}
in
let newpdf = Pdfpage.change_pages false pdf [page] in
image_resolution newpdf [pagenum] dpi
| (pagenum, name, Image (w, h)) ->
let lx = Pdfunits.convert 0. Pdfunits.PdfPoint Pdfunits.Inch (distance_between o x)
and ly = Pdfunits.convert 0. Pdfunits.PdfPoint Pdfunits.Inch (distance_between o y) in
let wdpi = float w /. lx
and hdpi = float h /. ly in
add_image_result (pagenum, xobject, w, h, wdpi, hdpi)
(*Printf.printf "%i, %s, %i, %i, %f, %f\n" pagenum xobject w h wdpi hdpi*)
(*i else
Printf.printf "S %i, %s, %i, %i, %f, %f\n" pagenum xobject (int_of_float w) (int_of_float h) wdpi hdpi i*)
end
| Pdfops.Op_q ->
begin match !transform with
| [] -> raise (Failure "Unbalanced q/Q ops")
| h::t ->
let h' = ref Pdftransform.i_matrix in
h' := !h;
transform := h'::h::t
end
| Pdfops.Op_Q ->
begin match !transform with
| [] -> raise (Failure "Unbalanced q/Q ops")
| _ -> transform := tl !transform
end
| _ -> ())
pageops
with
e -> Printf.printf "Error %s\n" (Printexc.to_string e); flprint "\n"
and image_resolution pdf range dpi =
let images = ref [] in
Cpdfpage.iter_pages
(fun pagenum page ->
(* 1. Get all image names and their native resolutions from resources as string * int * int *)
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary xobjects) ->
iter
(function (name, xobject) ->
match Pdf.lookup_direct pdf "/Subtype" xobject with
| Some (Pdf.Name "/Image") ->
let width =
match Pdf.lookup_direct pdf "/Width" xobject with
| Some x -> Pdf.getnum x
| None -> 1.
and height =
match Pdf.lookup_direct pdf "/Height" xobject with
| Some x -> Pdf.getnum x
| None -> 1.
in
images := (pagenum, name, Image (int_of_float width, int_of_float height))::!images
| Some (Pdf.Name "/Form") ->
let resources =
match Pdf.lookup_direct pdf "/Resources" xobject with
| None -> page.Pdfpage.resources (* Inherit from page or form above. *)
| Some r -> r
and contents =
xobject
and matrix =
match Pdf.lookup_direct pdf "/Matrix" xobject with
| Some (Pdf.Array [a; b; c; d; e; f]) ->
{Pdftransform.a = Pdf.getnum a; Pdftransform.b = Pdf.getnum b; Pdftransform.c = Pdf.getnum c;
Pdftransform.d = Pdf.getnum d; Pdftransform.e = Pdf.getnum e; Pdftransform.f = Pdf.getnum f}
| _ -> Pdftransform.i_matrix
in
images := (pagenum, name, Form (matrix, contents, resources))::!images
| _ -> ()
)
xobjects
| _ -> ())
pdf
range;
(* Now, split into differing pages, and call [image_resolution_page] on each one *)
let pagesplits =
map
(function (a, _, _)::_ as ls -> (a, ls) | _ -> assert false)
(collate (fun (a, _, _) (b, _, _) -> compare a b) (rev !images))
and pages =
Pdfpage.pages_of_pagetree pdf
in
iter
(function (pagenum, images) ->
let page = select pagenum pages in
image_resolution_page pdf page pagenum dpi images)
pagesplits
let image_resolution pdf range dpi =
image_results := [];
image_resolution pdf range dpi;
rev !image_results

View File

@ -1,3 +1,5 @@
val extract_images : string -> val extract_images : string ->
string -> string ->
Cpdfmetadata.encoding -> bool -> bool -> Pdf.t -> int list -> string -> unit Cpdfmetadata.encoding -> bool -> bool -> Pdf.t -> int list -> string -> unit
val image_resolution : Pdf.t -> int list -> float -> (int * string * int * int * float * float) list