more
This commit is contained in:
parent
c711e3aa77
commit
9bdeccb343
2
Makefile
2
Makefile
|
@ -3,7 +3,7 @@ MODS = cpdfyojson cpdfxmlm \
|
|||
cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \
|
||||
cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \
|
||||
cpdfbookmarks cpdfpage cpdfaddtext cpdf cpdfimage cpdffont cpdftype \
|
||||
cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfspot \
|
||||
cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfdraft cpdfspot \
|
||||
cpdfpagelabels cpdfcreate cpdfannot cpdfcommand
|
||||
|
||||
SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml
|
||||
|
|
318
cpdf.ml
318
cpdf.ml
|
@ -1421,167 +1421,6 @@ let thinlines range width pdf =
|
|||
in
|
||||
Cpdfpage.process_pages (ppstub thinpage) pdf range
|
||||
|
||||
|
||||
(* \section{Making draft documents} *)
|
||||
|
||||
(* [isimage pdf (_, xobj)] is [true] exactly when the /Subtype entry of [xobj]
   is the name /Image. The first component of the pair (the resource name) is
   ignored. *)
let isimage pdf (_, xobj) =
  match Pdf.lookup_direct pdf "/Subtype" xobj with
  | Some (Pdf.Name subtype) -> subtype = "/Image"
  | Some _ | None -> false
|
||||
|
||||
(* [xobject_isimage pdf resources name] decides whether the xobject called
   [name] in the /XObject dictionary of [resources] is an image.

   - If [resources] has no /XObject entry the answer is [false].
   - If /XObject exists but has no entry [name], [Pdf.lookup_fail] is called
     with the message "xobject not there".
   - If [resources] is not a dictionary, fails with "bad resources". *)
let xobject_isimage pdf resources name =
  match resources with
  | Pdf.Dictionary _ ->
      (match Pdf.lookup_direct pdf "/XObject" resources with
       | None -> false
       | Some xobjects ->
           let xobj = Pdf.lookup_fail "xobject not there" pdf name xobjects in
           isimage pdf ("", xobj))
  | _ -> failwith "bad resources"
|
||||
|
||||
(* The substitute for an image.

   When [boxes] is true this is the operator sequence for a unit square with
   both diagonals drawn (q, w, G, re, m, l, m, l, S, Q), returned in REVERSE
   order because [remove_images_stream] prepends it to an accumulator that is
   itself reversed at the end. When [boxes] is false nothing is drawn. *)
let substitute boxes =
  let crossed_box =
    [Pdfops.Op_q;
     Pdfops.Op_w 0.;
     Pdfops.Op_G 0.;
     Pdfops.Op_re (0., 0., 1., 1.);
     Pdfops.Op_m (0., 0.);
     Pdfops.Op_l (1., 1.);
     Pdfops.Op_m (0., 1.);
     Pdfops.Op_l (1., 0.);
     Pdfops.Op_S;
     Pdfops.Op_Q]
  in
  if boxes then rev crossed_box else []
|
||||
|
||||
(* Remove references to images from a graphics stream.

   [prev] is an accumulator of already-processed operators in reverse order
   (callers in this file pass []). Each operator is either kept or replaced by
   [substitute boxes]:
   - [Op_Do name] is replaced when [name] is an image xobject in [resources]
     and either [onlyremove] is [None] or it is [Some name];
   - an inline image is replaced only when [onlyremove] is [None];
   - everything else is kept.
   The result is in stream order. *)
let remove_images_stream onlyremove boxes pdf resources prev ops =
  let wanted name =
    match onlyremove with
    | None -> true
    | Some x -> x = name
  in
  let step acc op =
    let replace =
      match op with
      | Pdfops.Op_Do name -> xobject_isimage pdf resources name && wanted name
      | Pdfops.InlineImage _ -> onlyremove = None
      | _ -> false
    in
    if replace then substitute boxes @ acc else op :: acc
  in
  rev (List.fold_left step prev ops)
|
||||
|
||||
(* [process_form_xobject onlyremove boxes pdf form] removes images from a form
   xobject by treating it as a one-stream page and running
   [remove_images_page] over it. Returns the rewritten form stream and the
   (possibly updated) document.

   The result keeps the form's own dictionary; only its /Length entry is
   replaced to match the regenerated content.
   Raises [Pdf.PDFError "not a stream"] if [form] is not a stream. *)
let rec process_form_xobject onlyremove boxes pdf form =
  let form = Pdf.direct pdf form in
  let form_resources =
    match Pdf.lookup_direct pdf "/Resources" form with
    | Some r -> r
    | None -> Pdf.Dictionary []
  in
  (* A stand-in page whose only content stream is the form itself. *)
  let page =
    {Pdfpage.content = [form];
     Pdfpage.mediabox = Pdf.Null;
     Pdfpage.resources = form_resources;
     Pdfpage.rotate = Pdfpage.Rotate0;
     Pdfpage.rest = Pdf.Dictionary []}
  in
  let page', pdf = remove_images_page onlyremove boxes pdf page in
  let form' =
    match form with
    | Pdf.Stream {contents = (dict, _)} ->
        let ops =
          Pdfops.parse_operators pdf (Pdf.Dictionary []) page'.Pdfpage.content
        in
        begin match Pdfops.stream_of_ops ops with
        | Pdf.Stream {contents = (_, Pdf.Got data)} ->
            let dict' =
              Pdf.add_dict_entry dict "/Length" (Pdf.Integer (bytes_size data))
            in
            Pdf.Stream {contents = (dict', Pdf.Got data)}
        | _ -> assert false
        end
    | _ -> raise (Pdf.PDFError "not a stream")
  in
  form', pdf
|
||||
|
||||
(* Remove images from a page.

   The page's /XObject dictionary is rebuilt from scratch:
   - every form xobject is processed with [process_form_xobject] and added to
     the document as a new object, referenced under its original name;
   - image xobjects are dropped, except that when [onlyremove] is [Some n]
     every image not called [n] is kept;
   - xobjects of any other subtype are not carried over.
   The content stream is then rewritten by [remove_images_stream] against the
   ORIGINAL resources and Flate-encoded. Returns the new page and document. *)
and remove_images_page onlyremove boxes pdf page =
  let has_subtype subtype (_, xobj) =
    match Pdf.lookup_direct pdf "/Subtype" xobj with
    | Some (Pdf.Name n) -> n = subtype
    | _ -> false
  in
  let xobject_entries =
    match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
    | Some (Pdf.Dictionary elts) -> elts
    | _ -> []
  in
  let form_xobjects = keep (has_subtype "/Form") xobject_entries in
  let image_xobjects = keep (has_subtype "/Image") xobject_entries in
  (* First pass: process every form, threading the document through. *)
  let rev_forms, pdf =
    List.fold_left
      (fun (processed, pdf) (_, form) ->
         let form', pdf' = process_form_xobject onlyremove boxes pdf form in
         (form' :: processed, pdf'))
      ([], pdf)
      form_xobjects
  in
  (* Second pass: only once all forms are processed, add them as new objects,
     in dictionary order. *)
  let rev_refs =
    List.fold_left
      (fun refs form -> Pdf.Indirect (Pdf.addobj pdf form) :: refs)
      []
      (rev rev_forms)
  in
  let form_entries = combine (map fst form_xobjects) (rev rev_refs) in
  let kept_images =
    match onlyremove with
    | None -> []
    | Some n -> keep (fun (n', _) -> n <> n') image_xobjects
  in
  let resources' =
    Pdf.add_dict_entry
      page.Pdfpage.resources "/XObject" (Pdf.Dictionary (kept_images @ form_entries))
  in
  let ops =
    Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
  in
  let content' =
    remove_images_stream onlyremove boxes pdf page.Pdfpage.resources [] ops
  in
  let stream = Pdfops.stream_of_ops content' in
  Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream;
  {page with Pdfpage.content = [stream]; Pdfpage.resources = resources'}, pdf
|
||||
|
||||
(* Remove images from the pages of a document whose 1-based position (as given
   by [indx]) appears in [range]; other pages are passed through untouched.
   [onlyremove] and [boxes] are handed to [remove_images_page]. The page tree
   is replaced with [Pdfpage.change_pages true]. *)
let draft onlyremove boxes range pdf =
  let pages = Pdfpage.pages_of_pagetree pdf in
  let rev_pages, pdf =
    List.fold_left2
      (fun (acc, pdf) page pagenum ->
         if mem pagenum range then
           let page', pdf' = remove_images_page onlyremove boxes pdf page in
           (page' :: acc, pdf')
         else
           (page :: acc, pdf))
      ([], pdf)
      pages
      (indx pages)
  in
  Pdfpage.change_pages true pdf (rev rev_pages)
|
||||
|
||||
(* Parse the new content to make sure syntactically ok, append
|
||||
* as required. Rewrite the content *)
|
||||
let append_page_content_page fast s before pdf n page =
|
||||
|
@ -1776,153 +1615,6 @@ let remove_clipping pdf range =
|
|||
in
|
||||
Cpdfpage.process_pages (ppstub remove_clipping_page) pdf range
|
||||
|
||||
(* Image resolution *)

(* What a named xobject in a page's resources turned out to be. *)
type xobj =
  | Image of int * int (* width, height *)
  | Form of Pdftransform.transform_matrix * Pdf.pdfobject * Pdf.pdfobject
    (* matrix, the form xobject itself, the resources to read it with.
       Will add actual data later. *)

(* Results gathered during one run, most recent first. Module-level state:
   the public [image_resolution] clears it before each run. *)
let image_results = ref []

(* Record one result. *)
let add_image_result i =
  image_results =| i
|
||||
|
||||
(* [image_resolution_page pdf page pagenum dpi images] walks the operators of
   [page], tracking the current transformation matrix through cm / q / Q, and
   handles each [Do]:
   - for an image, records [(pagenum, name, w, h, wdpi, hdpi)] via
     [add_image_result], where the dpi figures are the pixel dimensions
     divided by the transformed length, in inches, of the unit square's sides;
   - for a form, builds a temporary one-page document from the form's content
     (prefixed with the combined matrix) and recurses via [image_resolution].

   [images] lists the (pagenum, name, thing) entries known for this page. A
   [Do] whose name is not in [images] (for example an xobject that is neither
   an image nor a form) is skipped; previously it hit [assert false], which
   abandoned the rest of the page.

   Best effort: any exception is reported on standard output and processing
   of this page stops. [dpi] is only passed on to the recursive call. *)
let rec image_resolution_page pdf page pagenum dpi (images : (int * string * xobj) list) =
  try
    let pageops =
      Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
    in
    (* Stack of matrices; the head is the current one. *)
    let transform = ref [ref Pdftransform.i_matrix] in
    let current_matrix () =
      match !transform with
      | [] -> raise (Failure "no transform")
      | top::_ -> !top
    in
    iter
      (function
       | Pdfops.Op_cm matrix ->
           begin match !transform with
           | [] -> raise (Failure "no transform")
           | top::_ -> top := Pdftransform.matrix_compose !top matrix
           end
       | Pdfops.Op_Do xobject ->
           let ctm = current_matrix () in
           let known =
             try Some (List.find (fun (_, name, _) -> name = xobject) images)
             with Not_found -> None
           in
           begin match known with
           | None -> ()
           | Some (pagenum, _, Form (xobj_matrix, content, resources)) ->
               (* Add in matrix etc. *)
               let ops =
                 Pdfops.Op_cm (Pdftransform.matrix_compose xobj_matrix ctm)::
                 Pdfops.parse_operators pdf resources [content]
               in
               let page =
                 {Pdfpage.content = [Pdfops.stream_of_ops ops];
                  Pdfpage.mediabox = Pdfpage.rectangle_of_paper Pdfpaper.a4;
                  Pdfpage.resources = resources;
                  Pdfpage.rotate = Pdfpage.Rotate0;
                  Pdfpage.rest = Pdf.Dictionary []}
               in
               let newpdf = Pdfpage.change_pages false pdf [page] in
               image_resolution newpdf [pagenum] dpi
           | Some (pagenum, _, Image (w, h)) ->
               let trans point = Pdftransform.transform_matrix ctm point in
               let o = trans (0., 0.)
               and x = trans (1., 0.)
               and y = trans (0., 1.) in
               let inches length =
                 Pdfunits.convert 0. Pdfunits.PdfPoint Pdfunits.Inch length
               in
               let lx = inches (distance_between o x)
               and ly = inches (distance_between o y) in
               let wdpi = float w /. lx
               and hdpi = float h /. ly in
               add_image_result (pagenum, xobject, w, h, wdpi, hdpi)
           end
       | Pdfops.Op_q ->
           (* Push a copy of the current matrix. *)
           begin match !transform with
           | [] -> raise (Failure "Unbalanced q/Q ops")
           | h::t -> transform := (ref !h)::h::t
           end
       | Pdfops.Op_Q ->
           begin match !transform with
           | [] -> raise (Failure "Unbalanced q/Q ops")
           | _::t -> transform := t
           end
       | _ -> ())
      pageops
  with
    e -> Printf.printf "Error %s\n" (Printexc.to_string e); flprint "\n"
|
||||
|
||||
(* [image_resolution pdf range dpi] first collects, for every page in [range],
   the image and form xobjects named in the page's /XObject dictionary, then
   groups them by page number and runs [image_resolution_page] on each group.

   A missing /Width or /Height counts as 1. A form without its own /Resources
   uses the page's; a form without a six-element /Matrix uses the identity. *)
and image_resolution pdf range dpi =
  let images = ref [] in
  let dimension key xobject =
    match Pdf.lookup_direct pdf key xobject with
    | Some x -> int_of_float (Pdf.getnum x)
    | None -> 1
  in
  let form_matrix xobject =
    match Pdf.lookup_direct pdf "/Matrix" xobject with
    | Some (Pdf.Array [a; b; c; d; e; f]) ->
        {Pdftransform.a = Pdf.getnum a; Pdftransform.b = Pdf.getnum b;
         Pdftransform.c = Pdf.getnum c; Pdftransform.d = Pdf.getnum d;
         Pdftransform.e = Pdf.getnum e; Pdftransform.f = Pdf.getnum f}
    | _ -> Pdftransform.i_matrix
  in
  (* 1. Get all image names and their native sizes from the resources. *)
  let note pagenum page (name, xobject) =
    match Pdf.lookup_direct pdf "/Subtype" xobject with
    | Some (Pdf.Name "/Image") ->
        let width = dimension "/Width" xobject in
        let height = dimension "/Height" xobject in
        images =| (pagenum, name, Image (width, height))
    | Some (Pdf.Name "/Form") ->
        let resources =
          match Pdf.lookup_direct pdf "/Resources" xobject with
          | None -> page.Pdfpage.resources (* Inherit from page or form above. *)
          | Some r -> r
        in
        images =| (pagenum, name, Form (form_matrix xobject, xobject, resources))
    | _ -> ()
  in
  Cpdfpage.iter_pages
    (fun pagenum page ->
       match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
       | Some (Pdf.Dictionary xobjects) -> iter (note pagenum page) xobjects
       | _ -> ())
    pdf
    range;
  (* 2. Split by page, and call [image_resolution_page] on each group. *)
  let same_page (a, _, _) (b, _, _) = compare a b in
  let pagesplits =
    map
      (function (a, _, _)::_ as ls -> (a, ls) | _ -> assert false)
      (collate same_page (rev !images))
  in
  let pages = Pdfpage.pages_of_pagetree pdf in
  iter
    (fun (pagenum, group) ->
       image_resolution_page pdf (select pagenum pages) pagenum dpi group)
    pagesplits
|
||||
|
||||
(* Public entry point: clear the shared result list, run the recursive
   [image_resolution] defined above, and return what it collected in the
   order it was found. Relies on the module-level [image_results]. *)
let image_resolution pdf range dpi =
  let () = image_results := [] in
  let () = image_resolution pdf range dpi in
  rev !image_results
|
||||
|
||||
(* copy the contents of the box f to the box t. If mediabox_if_missing is set,
|
||||
the contents of the mediabox will be used if the from box is not available. If
|
||||
mediabox_if_missing is false, the page is unaltered. *)
|
||||
|
@ -1964,13 +1656,3 @@ let remove_unused_resources_page pdf n page =
|
|||
|
||||
(* Apply [remove_unused_resources_page] to every page, numbered from 1 up to
   [Pdfpage.endpage pdf]. *)
let remove_unused_resources pdf =
  let all_pages = ilist 1 (Pdfpage.endpage pdf) in
  let process = ppstub (remove_unused_resources_page pdf) in
  Cpdfpage.process_pages process pdf all_pages
|
||||
|
||||
|
||||
(* [create_pdf pages pagesize] builds a fresh document made of [pages] blank
   pages of paper size [pagesize]. Each page has a single empty content stream
   and an empty resources dictionary. *)
let create_pdf pages pagesize =
  let blank = Pdfpage.blankpage pagesize in
  let page =
    {blank with
       Pdfpage.content = [Pdfops.stream_of_ops []];
       Pdfpage.resources = Pdf.Dictionary []}
  in
  let all_pages = many page pages in
  let pdf, pageroot = Pdfpage.add_pagetree all_pages (Pdf.empty ()) in
  Pdfpage.add_root pageroot [] pdf
|
||||
|
|
10
cpdf.mli
10
cpdf.mli
|
@ -113,9 +113,6 @@ val blacklines : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t
|
|||
(** Make all fills on certain pages black. *)
|
||||
val blackfills : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t
|
||||
|
||||
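(** [draft onlyremove boxes range pdf] removes images from the pages of [pdf]
    listed in [range]. With [onlyremove = Some name], only the image xobject
    called [name] is removed and inline images are kept; with [None], every
    image xobject and inline image is removed. If [boxes] is true, each
    removed image is replaced by a crossed box. *)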
|
||||
val draft : string option -> bool -> int list -> Pdf.t -> Pdf.t
|
||||
|
||||
val remove_all_text : int list -> Pdf.t -> Pdf.t
|
||||
|
||||
(**/**)
|
||||
|
@ -134,13 +131,6 @@ val print_dict_entry : Pdf.t -> string -> unit
|
|||
|
||||
val remove_clipping : Pdf.t -> int list -> Pdf.t
|
||||
|
||||
(** [image_resolution pdf range dpi] reports the image xobjects drawn on the
    pages in [range], including those drawn inside form xobjects. Each result
    is [(pagenum, name, width, height, wdpi, hdpi)]: the page number, the
    xobject name, the image's pixel dimensions, and its effective horizontal
    and vertical resolution as placed. Results are not filtered by [dpi];
    callers compare [wdpi] and [hdpi] against their own threshold. *)
val image_resolution : Pdf.t -> int list -> float -> (int * string * int * int * float * float) list
|
||||
|
||||
val copy_box : string -> string -> bool -> Pdf.t -> int list -> Pdf.t
|
||||
|
||||
|
||||
val remove_unused_resources : Pdf.t -> Pdf.t
|
||||
|
||||
|
||||
|
||||
(** [create_pdf pages pagesize] makes a new document of [pages] blank pages,
    each of paper size [pagesize]. *)
val create_pdf : int -> Pdfpaper.t -> Pdf.t
|
||||
|
|
|
@ -3649,7 +3649,7 @@ let go () =
|
|||
| Some Draft ->
|
||||
let pdf = get_single_pdf args.op false in
|
||||
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
||||
write_pdf false (Cpdf.draft args.removeonly args.boxes range pdf)
|
||||
write_pdf false (Cpdfdraft.draft args.removeonly args.boxes range pdf)
|
||||
| Some (AddText text) ->
|
||||
let pdf = get_single_pdf args.op false in
|
||||
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
||||
|
@ -3778,7 +3778,7 @@ let go () =
|
|||
| Some (ImageResolution f) ->
|
||||
let pdf = get_single_pdf args.op true in
|
||||
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
||||
let images = Cpdf.image_resolution pdf range f in
|
||||
let images = Cpdfimage.image_resolution pdf range f in
|
||||
iter
|
||||
(function (pagenum, xobject, w, h, wdpi, hdpi) ->
|
||||
if wdpi < f || hdpi < f then
|
||||
|
@ -3854,7 +3854,7 @@ let go () =
|
|||
let pdf = get_single_pdf args.op false in
|
||||
write_pdf false (Cpdfbookmarks.bookmarks_open_to_level n pdf)
|
||||
| Some CreatePDF ->
|
||||
let pdf = Cpdf.create_pdf args.createpdf_pages args.createpdf_pagesize in
|
||||
let pdf = Cpdfcreate.blank_document_paper args.createpdf_pagesize args.createpdf_pages in
|
||||
write_pdf false pdf
|
||||
| Some RemoveAllText ->
|
||||
let pdf = get_single_pdf args.op false in
|
||||
|
|
|
@ -0,0 +1,163 @@
|
|||
open Pdfutil
|
||||
open Pdfio
|
||||
|
||||
(* \section{Making draft documents} *)
|
||||
|
||||
(* [isimage pdf (_, xobj)] is [true] exactly when the /Subtype entry of [xobj]
   is the name /Image. The first component of the pair (the resource name) is
   ignored. *)
let isimage pdf (_, xobj) =
  match Pdf.lookup_direct pdf "/Subtype" xobj with
  | Some (Pdf.Name subtype) -> subtype = "/Image"
  | Some _ | None -> false
|
||||
|
||||
(* [xobject_isimage pdf resources name] decides whether the xobject called
   [name] in the /XObject dictionary of [resources] is an image.

   - If [resources] has no /XObject entry the answer is [false].
   - If /XObject exists but has no entry [name], [Pdf.lookup_fail] is called
     with the message "xobject not there".
   - If [resources] is not a dictionary, fails with "bad resources". *)
let xobject_isimage pdf resources name =
  match resources with
  | Pdf.Dictionary _ ->
      (match Pdf.lookup_direct pdf "/XObject" resources with
       | None -> false
       | Some xobjects ->
           let xobj = Pdf.lookup_fail "xobject not there" pdf name xobjects in
           isimage pdf ("", xobj))
  | _ -> failwith "bad resources"
|
||||
|
||||
(* The substitute for an image.

   When [boxes] is true this is the operator sequence for a unit square with
   both diagonals drawn (q, w, G, re, m, l, m, l, S, Q), returned in REVERSE
   order because [remove_images_stream] prepends it to an accumulator that is
   itself reversed at the end. When [boxes] is false nothing is drawn. *)
let substitute boxes =
  let crossed_box =
    [Pdfops.Op_q;
     Pdfops.Op_w 0.;
     Pdfops.Op_G 0.;
     Pdfops.Op_re (0., 0., 1., 1.);
     Pdfops.Op_m (0., 0.);
     Pdfops.Op_l (1., 1.);
     Pdfops.Op_m (0., 1.);
     Pdfops.Op_l (1., 0.);
     Pdfops.Op_S;
     Pdfops.Op_Q]
  in
  if boxes then rev crossed_box else []
|
||||
|
||||
(* Remove references to images from a graphics stream.

   [prev] is an accumulator of already-processed operators in reverse order
   (callers in this file pass []). Each operator is either kept or replaced by
   [substitute boxes]:
   - [Op_Do name] is replaced when [name] is an image xobject in [resources]
     and either [onlyremove] is [None] or it is [Some name];
   - an inline image is replaced only when [onlyremove] is [None];
   - everything else is kept.
   The result is in stream order. *)
let remove_images_stream onlyremove boxes pdf resources prev ops =
  let wanted name =
    match onlyremove with
    | None -> true
    | Some x -> x = name
  in
  let step acc op =
    let replace =
      match op with
      | Pdfops.Op_Do name -> xobject_isimage pdf resources name && wanted name
      | Pdfops.InlineImage _ -> onlyremove = None
      | _ -> false
    in
    if replace then substitute boxes @ acc else op :: acc
  in
  rev (List.fold_left step prev ops)
|
||||
|
||||
(* [process_form_xobject onlyremove boxes pdf form] removes images from a form
   xobject by treating it as a one-stream page and running
   [remove_images_page] over it. Returns the rewritten form stream and the
   (possibly updated) document.

   The result keeps the form's own dictionary; only its /Length entry is
   replaced to match the regenerated content.
   Raises [Pdf.PDFError "not a stream"] if [form] is not a stream. *)
let rec process_form_xobject onlyremove boxes pdf form =
  let form = Pdf.direct pdf form in
  let form_resources =
    match Pdf.lookup_direct pdf "/Resources" form with
    | Some r -> r
    | None -> Pdf.Dictionary []
  in
  (* A stand-in page whose only content stream is the form itself. *)
  let page =
    {Pdfpage.content = [form];
     Pdfpage.mediabox = Pdf.Null;
     Pdfpage.resources = form_resources;
     Pdfpage.rotate = Pdfpage.Rotate0;
     Pdfpage.rest = Pdf.Dictionary []}
  in
  let page', pdf = remove_images_page onlyremove boxes pdf page in
  let form' =
    match form with
    | Pdf.Stream {contents = (dict, _)} ->
        let ops =
          Pdfops.parse_operators pdf (Pdf.Dictionary []) page'.Pdfpage.content
        in
        begin match Pdfops.stream_of_ops ops with
        | Pdf.Stream {contents = (_, Pdf.Got data)} ->
            let dict' =
              Pdf.add_dict_entry dict "/Length" (Pdf.Integer (bytes_size data))
            in
            Pdf.Stream {contents = (dict', Pdf.Got data)}
        | _ -> assert false
        end
    | _ -> raise (Pdf.PDFError "not a stream")
  in
  form', pdf
|
||||
|
||||
(* Remove images from a page.

   The page's /XObject dictionary is rebuilt from scratch:
   - every form xobject is processed with [process_form_xobject] and added to
     the document as a new object, referenced under its original name;
   - image xobjects are dropped, except that when [onlyremove] is [Some n]
     every image not called [n] is kept;
   - xobjects of any other subtype are not carried over.
   The content stream is then rewritten by [remove_images_stream] against the
   ORIGINAL resources and Flate-encoded. Returns the new page and document. *)
and remove_images_page onlyremove boxes pdf page =
  let has_subtype subtype (_, xobj) =
    match Pdf.lookup_direct pdf "/Subtype" xobj with
    | Some (Pdf.Name n) -> n = subtype
    | _ -> false
  in
  let xobject_entries =
    match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
    | Some (Pdf.Dictionary elts) -> elts
    | _ -> []
  in
  let form_xobjects = keep (has_subtype "/Form") xobject_entries in
  let image_xobjects = keep (has_subtype "/Image") xobject_entries in
  (* First pass: process every form, threading the document through. *)
  let rev_forms, pdf =
    List.fold_left
      (fun (processed, pdf) (_, form) ->
         let form', pdf' = process_form_xobject onlyremove boxes pdf form in
         (form' :: processed, pdf'))
      ([], pdf)
      form_xobjects
  in
  (* Second pass: only once all forms are processed, add them as new objects,
     in dictionary order. *)
  let rev_refs =
    List.fold_left
      (fun refs form -> Pdf.Indirect (Pdf.addobj pdf form) :: refs)
      []
      (rev rev_forms)
  in
  let form_entries = combine (map fst form_xobjects) (rev rev_refs) in
  let kept_images =
    match onlyremove with
    | None -> []
    | Some n -> keep (fun (n', _) -> n <> n') image_xobjects
  in
  let resources' =
    Pdf.add_dict_entry
      page.Pdfpage.resources "/XObject" (Pdf.Dictionary (kept_images @ form_entries))
  in
  let ops =
    Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
  in
  let content' =
    remove_images_stream onlyremove boxes pdf page.Pdfpage.resources [] ops
  in
  let stream = Pdfops.stream_of_ops content' in
  Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream;
  {page with Pdfpage.content = [stream]; Pdfpage.resources = resources'}, pdf
|
||||
|
||||
(* Remove images from the pages of a document whose 1-based position (as given
   by [indx]) appears in [range]; other pages are passed through untouched.
   [onlyremove] and [boxes] are handed to [remove_images_page]. The page tree
   is replaced with [Pdfpage.change_pages true]. *)
let draft onlyremove boxes range pdf =
  let pages = Pdfpage.pages_of_pagetree pdf in
  let rev_pages, pdf =
    List.fold_left2
      (fun (acc, pdf) page pagenum ->
         if mem pagenum range then
           let page', pdf' = remove_images_page onlyremove boxes pdf page in
           (page' :: acc, pdf')
         else
           (page :: acc, pdf))
      ([], pdf)
      pages
      (indx pages)
  in
  Pdfpage.change_pages true pdf (rev rev_pages)
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
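(** [draft onlyremove boxes range pdf] removes images from the pages of [pdf]
    listed in [range]. With [onlyremove = Some name], only the image xobject
    called [name] is removed and inline images are kept; with [None], every
    image xobject and inline image is removed. If [boxes] is true, each
    removed image is replaced by a crossed box. *)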
|
||||
val draft : string option -> bool -> int list -> Pdf.t -> Pdf.t
|
148
cpdfimage.ml
148
cpdfimage.ml
|
@ -125,3 +125,151 @@ let extract_images path_to_p2p path_to_im encoding dedup dedup_per_page pdf rang
|
|||
iter (extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms)
|
||||
pages
|
||||
(indx pages)
|
||||
|
||||
(* Image resolution *)

(* What a named xobject in a page's resources turned out to be. *)
type xobj =
  | Image of int * int (* width, height *)
  | Form of Pdftransform.transform_matrix * Pdf.pdfobject * Pdf.pdfobject
    (* matrix, the form xobject itself, the resources to read it with.
       Will add actual data later. *)

(* Results gathered during one run, most recent first. Module-level state:
   the public [image_resolution] clears it before each run. *)
let image_results = ref []

(* Record one result. *)
let add_image_result i =
  image_results =| i
|
||||
|
||||
(* [image_resolution_page pdf page pagenum dpi images] walks the operators of
   [page], tracking the current transformation matrix through cm / q / Q, and
   handles each [Do]:
   - for an image, records [(pagenum, name, w, h, wdpi, hdpi)] via
     [add_image_result], where the dpi figures are the pixel dimensions
     divided by the transformed length, in inches, of the unit square's sides;
   - for a form, builds a temporary one-page document from the form's content
     (prefixed with the combined matrix) and recurses via [image_resolution].

   [images] lists the (pagenum, name, thing) entries known for this page. A
   [Do] whose name is not in [images] (for example an xobject that is neither
   an image nor a form) is skipped; previously it hit [assert false], which
   abandoned the rest of the page.

   Best effort: any exception is reported on standard output and processing
   of this page stops. [dpi] is only passed on to the recursive call. *)
let rec image_resolution_page pdf page pagenum dpi (images : (int * string * xobj) list) =
  try
    let pageops =
      Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
    in
    (* Stack of matrices; the head is the current one. *)
    let transform = ref [ref Pdftransform.i_matrix] in
    let current_matrix () =
      match !transform with
      | [] -> raise (Failure "no transform")
      | top::_ -> !top
    in
    iter
      (function
       | Pdfops.Op_cm matrix ->
           begin match !transform with
           | [] -> raise (Failure "no transform")
           | top::_ -> top := Pdftransform.matrix_compose !top matrix
           end
       | Pdfops.Op_Do xobject ->
           let ctm = current_matrix () in
           let known =
             try Some (List.find (fun (_, name, _) -> name = xobject) images)
             with Not_found -> None
           in
           begin match known with
           | None -> ()
           | Some (pagenum, _, Form (xobj_matrix, content, resources)) ->
               (* Add in matrix etc. *)
               let ops =
                 Pdfops.Op_cm (Pdftransform.matrix_compose xobj_matrix ctm)::
                 Pdfops.parse_operators pdf resources [content]
               in
               let page =
                 {Pdfpage.content = [Pdfops.stream_of_ops ops];
                  Pdfpage.mediabox = Pdfpage.rectangle_of_paper Pdfpaper.a4;
                  Pdfpage.resources = resources;
                  Pdfpage.rotate = Pdfpage.Rotate0;
                  Pdfpage.rest = Pdf.Dictionary []}
               in
               let newpdf = Pdfpage.change_pages false pdf [page] in
               image_resolution newpdf [pagenum] dpi
           | Some (pagenum, _, Image (w, h)) ->
               let trans point = Pdftransform.transform_matrix ctm point in
               let o = trans (0., 0.)
               and x = trans (1., 0.)
               and y = trans (0., 1.) in
               let inches length =
                 Pdfunits.convert 0. Pdfunits.PdfPoint Pdfunits.Inch length
               in
               let lx = inches (distance_between o x)
               and ly = inches (distance_between o y) in
               let wdpi = float w /. lx
               and hdpi = float h /. ly in
               add_image_result (pagenum, xobject, w, h, wdpi, hdpi)
           end
       | Pdfops.Op_q ->
           (* Push a copy of the current matrix. *)
           begin match !transform with
           | [] -> raise (Failure "Unbalanced q/Q ops")
           | h::t -> transform := (ref !h)::h::t
           end
       | Pdfops.Op_Q ->
           begin match !transform with
           | [] -> raise (Failure "Unbalanced q/Q ops")
           | _::t -> transform := t
           end
       | _ -> ())
      pageops
  with
    e -> Printf.printf "Error %s\n" (Printexc.to_string e); flprint "\n"
|
||||
|
||||
(* [image_resolution pdf range dpi] first collects, for every page in [range],
   the image and form xobjects named in the page's /XObject dictionary, then
   groups them by page number and runs [image_resolution_page] on each group.

   A missing /Width or /Height counts as 1. A form without its own /Resources
   uses the page's; a form without a six-element /Matrix uses the identity. *)
and image_resolution pdf range dpi =
  let images = ref [] in
  let dimension key xobject =
    match Pdf.lookup_direct pdf key xobject with
    | Some x -> int_of_float (Pdf.getnum x)
    | None -> 1
  in
  let form_matrix xobject =
    match Pdf.lookup_direct pdf "/Matrix" xobject with
    | Some (Pdf.Array [a; b; c; d; e; f]) ->
        {Pdftransform.a = Pdf.getnum a; Pdftransform.b = Pdf.getnum b;
         Pdftransform.c = Pdf.getnum c; Pdftransform.d = Pdf.getnum d;
         Pdftransform.e = Pdf.getnum e; Pdftransform.f = Pdf.getnum f}
    | _ -> Pdftransform.i_matrix
  in
  (* 1. Get all image names and their native sizes from the resources. *)
  let note pagenum page (name, xobject) =
    match Pdf.lookup_direct pdf "/Subtype" xobject with
    | Some (Pdf.Name "/Image") ->
        let width = dimension "/Width" xobject in
        let height = dimension "/Height" xobject in
        images =| (pagenum, name, Image (width, height))
    | Some (Pdf.Name "/Form") ->
        let resources =
          match Pdf.lookup_direct pdf "/Resources" xobject with
          | None -> page.Pdfpage.resources (* Inherit from page or form above. *)
          | Some r -> r
        in
        images =| (pagenum, name, Form (form_matrix xobject, xobject, resources))
    | _ -> ()
  in
  Cpdfpage.iter_pages
    (fun pagenum page ->
       match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
       | Some (Pdf.Dictionary xobjects) -> iter (note pagenum page) xobjects
       | _ -> ())
    pdf
    range;
  (* 2. Split by page, and call [image_resolution_page] on each group. *)
  let same_page (a, _, _) (b, _, _) = compare a b in
  let pagesplits =
    map
      (function (a, _, _)::_ as ls -> (a, ls) | _ -> assert false)
      (collate same_page (rev !images))
  in
  let pages = Pdfpage.pages_of_pagetree pdf in
  iter
    (fun (pagenum, group) ->
       image_resolution_page pdf (select pagenum pages) pagenum dpi group)
    pagesplits
|
||||
|
||||
(* Public entry point: clear the shared result list, run the recursive
   [image_resolution] defined above, and return what it collected in the
   order it was found. Relies on the module-level [image_results]. *)
let image_resolution pdf range dpi =
  let () = image_results := [] in
  let () = image_resolution pdf range dpi in
  rev !image_results
|
||||
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
val extract_images : string ->
|
||||
string ->
|
||||
Cpdfmetadata.encoding -> bool -> bool -> Pdf.t -> int list -> string -> unit
|
||||
|
||||
(** [image_resolution pdf range dpi] reports the image xobjects drawn on the
    pages in [range], including those drawn inside form xobjects. Each result
    is [(pagenum, name, width, height, wdpi, hdpi)]: the page number, the
    xobject name, the image's pixel dimensions, and its effective horizontal
    and vertical resolution as placed. Results are not filtered by [dpi];
    callers compare [wdpi] and [hdpi] against their own threshold. *)
val image_resolution : Pdf.t -> int list -> float -> (int * string * int * int * float * float) list
|
||||
|
|
Loading…
Reference in New Issue