more
This commit is contained in:
parent
c711e3aa77
commit
9bdeccb343
2
Makefile
2
Makefile
|
@ -3,7 +3,7 @@ MODS = cpdfyojson cpdfxmlm \
|
||||||
cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \
|
cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \
|
||||||
cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \
|
cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \
|
||||||
cpdfbookmarks cpdfpage cpdfaddtext cpdf cpdfimage cpdffont cpdftype \
|
cpdfbookmarks cpdfpage cpdfaddtext cpdf cpdfimage cpdffont cpdftype \
|
||||||
cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfspot \
|
cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfdraft cpdfspot \
|
||||||
cpdfpagelabels cpdfcreate cpdfannot cpdfcommand
|
cpdfpagelabels cpdfcreate cpdfannot cpdfcommand
|
||||||
|
|
||||||
SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml
|
SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml
|
||||||
|
|
320
cpdf.ml
320
cpdf.ml
|
@ -1421,167 +1421,6 @@ let thinlines range width pdf =
|
||||||
in
|
in
|
||||||
Cpdfpage.process_pages (ppstub thinpage) pdf range
|
Cpdfpage.process_pages (ppstub thinpage) pdf range
|
||||||
|
|
||||||
|
|
||||||
(* \section{Making draft documents} *)
|
|
||||||
|
|
||||||
(* Predicate on an xobject: true if an image xobject. *)
|
|
||||||
let isimage pdf (_, xobj) =
|
|
||||||
match Pdf.lookup_direct pdf "/Subtype" xobj with
|
|
||||||
| Some (Pdf.Name "/Image") -> true
|
|
||||||
| _ -> false
|
|
||||||
|
|
||||||
(* Given a set of resources for a page, and the name of a resource, determine if
|
|
||||||
that name refers to an image xobject. *)
|
|
||||||
let xobject_isimage pdf resources name =
|
|
||||||
match resources with
|
|
||||||
| Pdf.Dictionary _ ->
|
|
||||||
begin match Pdf.lookup_direct pdf "/XObject" resources with
|
|
||||||
| Some xobjects ->
|
|
||||||
isimage pdf ("", Pdf.lookup_fail "xobject not there" pdf name xobjects)
|
|
||||||
| _ -> false
|
|
||||||
end
|
|
||||||
| _ -> failwith "bad resources"
|
|
||||||
|
|
||||||
(* The substitute for an image. *)
|
|
||||||
let substitute boxes =
|
|
||||||
if boxes then
|
|
||||||
rev
|
|
||||||
[Pdfops.Op_q;
|
|
||||||
Pdfops.Op_w 0.;
|
|
||||||
Pdfops.Op_G 0.;
|
|
||||||
Pdfops.Op_re (0., 0., 1., 1.);
|
|
||||||
Pdfops.Op_m (0., 0.);
|
|
||||||
Pdfops.Op_l (1., 1.);
|
|
||||||
Pdfops.Op_m (0., 1.);
|
|
||||||
Pdfops.Op_l (1., 0.);
|
|
||||||
Pdfops.Op_S;
|
|
||||||
Pdfops.Op_Q]
|
|
||||||
else
|
|
||||||
[]
|
|
||||||
|
|
||||||
(* Remove references to images from a graphics stream. *)
|
|
||||||
let rec remove_images_stream onlyremove boxes pdf resources prev = function
|
|
||||||
| [] -> rev prev
|
|
||||||
| (Pdfops.Op_Do name) as h::t ->
|
|
||||||
if xobject_isimage pdf resources name && (match onlyremove with None -> true | Some x -> x = name)
|
|
||||||
then remove_images_stream onlyremove boxes pdf resources (substitute boxes @ prev) t
|
|
||||||
else remove_images_stream onlyremove boxes pdf resources (h::prev) t
|
|
||||||
| Pdfops.InlineImage _ as h::t ->
|
|
||||||
if onlyremove <> None
|
|
||||||
then remove_images_stream onlyremove boxes pdf resources (h::prev) t
|
|
||||||
else remove_images_stream onlyremove boxes pdf resources (substitute boxes @ prev) t
|
|
||||||
| h::t ->
|
|
||||||
remove_images_stream onlyremove boxes pdf resources (h::prev) t
|
|
||||||
|
|
||||||
let rec process_form_xobject onlyremove boxes pdf form =
|
|
||||||
let form = Pdf.direct pdf form in
|
|
||||||
let page =
|
|
||||||
{Pdfpage.content = [form];
|
|
||||||
Pdfpage.mediabox = Pdf.Null;
|
|
||||||
Pdfpage.resources =
|
|
||||||
begin match Pdf.lookup_direct pdf "/Resources" form with
|
|
||||||
| Some r -> r
|
|
||||||
| None -> Pdf.Dictionary []
|
|
||||||
end;
|
|
||||||
Pdfpage.rotate = Pdfpage.Rotate0;
|
|
||||||
Pdfpage.rest = Pdf.Dictionary []}
|
|
||||||
in
|
|
||||||
let page', pdf =
|
|
||||||
remove_images_page onlyremove boxes pdf page
|
|
||||||
in
|
|
||||||
let form' =
|
|
||||||
match form with
|
|
||||||
| Pdf.Stream {contents = (dict, _)} ->
|
|
||||||
begin match
|
|
||||||
Pdfops.stream_of_ops
|
|
||||||
(Pdfops.parse_operators pdf (Pdf.Dictionary []) page'.Pdfpage.content)
|
|
||||||
with
|
|
||||||
| Pdf.Stream {contents = (_, Pdf.Got data)} ->
|
|
||||||
let dict' =
|
|
||||||
Pdf.add_dict_entry dict "/Length" (Pdf.Integer (bytes_size data))
|
|
||||||
in
|
|
||||||
Pdf.Stream {contents = (dict', Pdf.Got data)}
|
|
||||||
| _ -> assert false
|
|
||||||
end
|
|
||||||
| _ -> raise (Pdf.PDFError "not a stream")
|
|
||||||
in
|
|
||||||
form', pdf
|
|
||||||
|
|
||||||
(* Remove images from a page. *)
|
|
||||||
and remove_images_page onlyremove boxes pdf page =
|
|
||||||
let isform pdf xobj =
|
|
||||||
match Pdf.lookup_direct pdf "/Subtype" xobj with Some (Pdf.Name "/Form") -> true | _ -> false
|
|
||||||
in
|
|
||||||
let isimage pdf xobj =
|
|
||||||
match Pdf.lookup_direct pdf "/Subtype" xobj with Some (Pdf.Name "/Image") -> true | _ -> false
|
|
||||||
in
|
|
||||||
(* Remove image xobjects and look into form ones *)
|
|
||||||
let form_xobjects, image_xobjects =
|
|
||||||
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
|
|
||||||
| Some (Pdf.Dictionary elts) ->
|
|
||||||
keep (function (_, p) -> isform pdf p) elts,
|
|
||||||
keep (function (_, p) -> isimage pdf p) elts
|
|
||||||
| _ -> [], []
|
|
||||||
in
|
|
||||||
let resources', pdf =
|
|
||||||
let names, pointers = split form_xobjects in
|
|
||||||
let form_xobjects', pdf =
|
|
||||||
let pdf = ref pdf
|
|
||||||
in let outputs = ref [] in
|
|
||||||
iter
|
|
||||||
(fun p ->
|
|
||||||
let p', pdf' = process_form_xobject onlyremove boxes !pdf p in
|
|
||||||
pdf := pdf';
|
|
||||||
outputs =| p')
|
|
||||||
pointers;
|
|
||||||
rev !outputs, !pdf
|
|
||||||
in
|
|
||||||
let nums = ref [] in
|
|
||||||
iter
|
|
||||||
(fun xobj ->
|
|
||||||
let objnum = Pdf.addobj pdf xobj in
|
|
||||||
nums =| objnum)
|
|
||||||
form_xobjects';
|
|
||||||
let image_xobjects' =
|
|
||||||
match onlyremove with
|
|
||||||
None -> []
|
|
||||||
| Some n -> option_map (function (n', _) as xobj -> if n = n' then None else Some xobj) image_xobjects
|
|
||||||
in
|
|
||||||
let newdict =
|
|
||||||
Pdf.Dictionary (image_xobjects' @ combine names (map (fun x -> Pdf.Indirect x) (rev !nums)))
|
|
||||||
in
|
|
||||||
Pdf.add_dict_entry page.Pdfpage.resources "/XObject" newdict, pdf
|
|
||||||
in
|
|
||||||
let content' =
|
|
||||||
remove_images_stream onlyremove boxes pdf page.Pdfpage.resources []
|
|
||||||
(Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content)
|
|
||||||
in
|
|
||||||
{page with
|
|
||||||
Pdfpage.content =
|
|
||||||
(let stream = Pdfops.stream_of_ops content' in
|
|
||||||
Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream;
|
|
||||||
[stream]);
|
|
||||||
Pdfpage.resources = resources'}, pdf
|
|
||||||
|
|
||||||
(* Remove images from all pages in a document. *)
|
|
||||||
let draft onlyremove boxes range pdf =
|
|
||||||
let pages = Pdfpage.pages_of_pagetree pdf in
|
|
||||||
let pagenums = indx pages in
|
|
||||||
let pdf = ref pdf
|
|
||||||
in let pages' = ref [] in
|
|
||||||
iter2
|
|
||||||
(fun p pagenum ->
|
|
||||||
let p', pdf' =
|
|
||||||
if mem pagenum range
|
|
||||||
then remove_images_page onlyremove boxes !pdf p
|
|
||||||
else p, !pdf
|
|
||||||
in
|
|
||||||
pdf := pdf';
|
|
||||||
pages' =| p')
|
|
||||||
pages
|
|
||||||
pagenums;
|
|
||||||
Pdfpage.change_pages true !pdf (rev !pages')
|
|
||||||
|
|
||||||
(* Parse the new content to make sure syntactically ok, append
|
(* Parse the new content to make sure syntactically ok, append
|
||||||
* as required. Rewrite the content *)
|
* as required. Rewrite the content *)
|
||||||
let append_page_content_page fast s before pdf n page =
|
let append_page_content_page fast s before pdf n page =
|
||||||
|
@ -1775,154 +1614,7 @@ let remove_clipping pdf range =
|
||||||
{page with Pdfpage.content = content'}
|
{page with Pdfpage.content = content'}
|
||||||
in
|
in
|
||||||
Cpdfpage.process_pages (ppstub remove_clipping_page) pdf range
|
Cpdfpage.process_pages (ppstub remove_clipping_page) pdf range
|
||||||
|
|
||||||
(* Image resolution *)
|
|
||||||
type xobj =
|
|
||||||
| Image of int * int (* width, height *)
|
|
||||||
| Form of Pdftransform.transform_matrix * Pdf.pdfobject * Pdf.pdfobject (* Will add actual data later. *)
|
|
||||||
|
|
||||||
let image_results = ref []
|
|
||||||
|
|
||||||
let add_image_result i =
|
|
||||||
image_results := i::!image_results
|
|
||||||
|
|
||||||
(* Given a page and a list of (pagenum, name, thing) *)
|
|
||||||
let rec image_resolution_page pdf page pagenum dpi (images : (int * string * xobj) list) =
|
|
||||||
try
|
|
||||||
let pageops = Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
|
|
||||||
and transform = ref [ref Pdftransform.i_matrix] in
|
|
||||||
iter
|
|
||||||
(function
|
|
||||||
| Pdfops.Op_cm matrix ->
|
|
||||||
begin match !transform with
|
|
||||||
| [] -> raise (Failure "no transform")
|
|
||||||
| _ -> (hd !transform) := Pdftransform.matrix_compose !(hd !transform) matrix
|
|
||||||
end
|
|
||||||
| Pdfops.Op_Do xobject ->
|
|
||||||
let trans (x, y) =
|
|
||||||
match !transform with
|
|
||||||
| [] -> raise (Failure "no transform")
|
|
||||||
| _ -> Pdftransform.transform_matrix !(hd !transform) (x, y)
|
|
||||||
in
|
|
||||||
let o = trans (0., 0.)
|
|
||||||
and x = trans (1., 0.)
|
|
||||||
and y = trans (0., 1.)
|
|
||||||
in
|
|
||||||
(*i Printf.printf "o = %f, %f, x = %f, %f, y = %f, %f\n" (fst o) (snd o) (fst x) (snd x) (fst y) (snd y); i*)
|
|
||||||
let rec lookup_image k = function
|
|
||||||
| [] -> assert false
|
|
||||||
| (_, a, _) as h::_ when a = k -> h
|
|
||||||
| _::t -> lookup_image k t
|
|
||||||
in
|
|
||||||
begin match lookup_image xobject images with
|
|
||||||
| (pagenum, name, Form (xobj_matrix, content, resources)) ->
|
|
||||||
let content =
|
|
||||||
(* Add in matrix etc. *)
|
|
||||||
let total_matrix = Pdftransform.matrix_compose xobj_matrix !(hd !transform) in
|
|
||||||
let ops =
|
|
||||||
Pdfops.Op_cm total_matrix::
|
|
||||||
Pdfops.parse_operators pdf resources [content]
|
|
||||||
in
|
|
||||||
Pdfops.stream_of_ops ops
|
|
||||||
in
|
|
||||||
let page =
|
|
||||||
{Pdfpage.content = [content];
|
|
||||||
Pdfpage.mediabox = Pdfpage.rectangle_of_paper Pdfpaper.a4;
|
|
||||||
Pdfpage.resources = resources;
|
|
||||||
Pdfpage.rotate = Pdfpage.Rotate0;
|
|
||||||
Pdfpage.rest = Pdf.Dictionary []}
|
|
||||||
in
|
|
||||||
let newpdf = Pdfpage.change_pages false pdf [page] in
|
|
||||||
image_resolution newpdf [pagenum] dpi
|
|
||||||
| (pagenum, name, Image (w, h)) ->
|
|
||||||
let lx = Pdfunits.convert 0. Pdfunits.PdfPoint Pdfunits.Inch (distance_between o x)
|
|
||||||
and ly = Pdfunits.convert 0. Pdfunits.PdfPoint Pdfunits.Inch (distance_between o y) in
|
|
||||||
let wdpi = float w /. lx
|
|
||||||
and hdpi = float h /. ly in
|
|
||||||
add_image_result (pagenum, xobject, w, h, wdpi, hdpi)
|
|
||||||
(*Printf.printf "%i, %s, %i, %i, %f, %f\n" pagenum xobject w h wdpi hdpi*)
|
|
||||||
(*i else
|
|
||||||
Printf.printf "S %i, %s, %i, %i, %f, %f\n" pagenum xobject (int_of_float w) (int_of_float h) wdpi hdpi i*)
|
|
||||||
end
|
|
||||||
| Pdfops.Op_q ->
|
|
||||||
begin match !transform with
|
|
||||||
| [] -> raise (Failure "Unbalanced q/Q ops")
|
|
||||||
| h::t ->
|
|
||||||
let h' = ref Pdftransform.i_matrix in
|
|
||||||
h' := !h;
|
|
||||||
transform := h'::h::t
|
|
||||||
end
|
|
||||||
| Pdfops.Op_Q ->
|
|
||||||
begin match !transform with
|
|
||||||
| [] -> raise (Failure "Unbalanced q/Q ops")
|
|
||||||
| _ -> transform := tl !transform
|
|
||||||
end
|
|
||||||
| _ -> ())
|
|
||||||
pageops
|
|
||||||
with
|
|
||||||
e -> Printf.printf "Error %s\n" (Printexc.to_string e); flprint "\n"
|
|
||||||
|
|
||||||
and image_resolution pdf range dpi =
|
|
||||||
let images = ref [] in
|
|
||||||
Cpdfpage.iter_pages
|
|
||||||
(fun pagenum page ->
|
|
||||||
(* 1. Get all image names and their native resolutions from resources as string * int * int *)
|
|
||||||
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
|
|
||||||
| Some (Pdf.Dictionary xobjects) ->
|
|
||||||
iter
|
|
||||||
(function (name, xobject) ->
|
|
||||||
match Pdf.lookup_direct pdf "/Subtype" xobject with
|
|
||||||
| Some (Pdf.Name "/Image") ->
|
|
||||||
let width =
|
|
||||||
match Pdf.lookup_direct pdf "/Width" xobject with
|
|
||||||
| Some x -> Pdf.getnum x
|
|
||||||
| None -> 1.
|
|
||||||
and height =
|
|
||||||
match Pdf.lookup_direct pdf "/Height" xobject with
|
|
||||||
| Some x -> Pdf.getnum x
|
|
||||||
| None -> 1.
|
|
||||||
in
|
|
||||||
images := (pagenum, name, Image (int_of_float width, int_of_float height))::!images
|
|
||||||
| Some (Pdf.Name "/Form") ->
|
|
||||||
let resources =
|
|
||||||
match Pdf.lookup_direct pdf "/Resources" xobject with
|
|
||||||
| None -> page.Pdfpage.resources (* Inherit from page or form above. *)
|
|
||||||
| Some r -> r
|
|
||||||
and contents =
|
|
||||||
xobject
|
|
||||||
and matrix =
|
|
||||||
match Pdf.lookup_direct pdf "/Matrix" xobject with
|
|
||||||
| Some (Pdf.Array [a; b; c; d; e; f]) ->
|
|
||||||
{Pdftransform.a = Pdf.getnum a; Pdftransform.b = Pdf.getnum b; Pdftransform.c = Pdf.getnum c;
|
|
||||||
Pdftransform.d = Pdf.getnum d; Pdftransform.e = Pdf.getnum e; Pdftransform.f = Pdf.getnum f}
|
|
||||||
| _ -> Pdftransform.i_matrix
|
|
||||||
in
|
|
||||||
images := (pagenum, name, Form (matrix, contents, resources))::!images
|
|
||||||
| _ -> ()
|
|
||||||
)
|
|
||||||
xobjects
|
|
||||||
| _ -> ())
|
|
||||||
pdf
|
|
||||||
range;
|
|
||||||
(* Now, split into differing pages, and call [image_resolution_page] on each one *)
|
|
||||||
let pagesplits =
|
|
||||||
map
|
|
||||||
(function (a, _, _)::_ as ls -> (a, ls) | _ -> assert false)
|
|
||||||
(collate (fun (a, _, _) (b, _, _) -> compare a b) (rev !images))
|
|
||||||
and pages =
|
|
||||||
Pdfpage.pages_of_pagetree pdf
|
|
||||||
in
|
|
||||||
iter
|
|
||||||
(function (pagenum, images) ->
|
|
||||||
let page = select pagenum pages in
|
|
||||||
image_resolution_page pdf page pagenum dpi images)
|
|
||||||
pagesplits
|
|
||||||
|
|
||||||
let image_resolution pdf range dpi =
|
|
||||||
image_results := [];
|
|
||||||
image_resolution pdf range dpi;
|
|
||||||
rev !image_results
|
|
||||||
|
|
||||||
(* copy the contents of the box f to the box t. If mediabox_if_missing is set,
|
(* copy the contents of the box f to the box t. If mediabox_if_missing is set,
|
||||||
the contents of the mediabox will be used if the from box is not available. If
|
the contents of the mediabox will be used if the from box is not available. If
|
||||||
mediabox_if_missing is false, the page is unaltered. *)
|
mediabox_if_missing is false, the page is unaltered. *)
|
||||||
|
@ -1964,13 +1656,3 @@ let remove_unused_resources_page pdf n page =
|
||||||
|
|
||||||
let remove_unused_resources pdf =
|
let remove_unused_resources pdf =
|
||||||
Cpdfpage.process_pages (ppstub (remove_unused_resources_page pdf)) pdf (ilist 1 (Pdfpage.endpage pdf))
|
Cpdfpage.process_pages (ppstub (remove_unused_resources_page pdf)) pdf (ilist 1 (Pdfpage.endpage pdf))
|
||||||
|
|
||||||
|
|
||||||
let create_pdf pages pagesize =
|
|
||||||
let page =
|
|
||||||
{(Pdfpage.blankpage pagesize) with
|
|
||||||
Pdfpage.content = [Pdfops.stream_of_ops []];
|
|
||||||
Pdfpage.resources = Pdf.Dictionary []}
|
|
||||||
in
|
|
||||||
let pdf, pageroot = Pdfpage.add_pagetree (many page pages) (Pdf.empty ()) in
|
|
||||||
Pdfpage.add_root pageroot [] pdf
|
|
||||||
|
|
10
cpdf.mli
10
cpdf.mli
|
@ -113,9 +113,6 @@ val blacklines : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t
|
||||||
(** Make all fills on certain pages black. *)
|
(** Make all fills on certain pages black. *)
|
||||||
val blackfills : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t
|
val blackfills : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t
|
||||||
|
|
||||||
(** Remove images from a PDF, optionally adding crossed boxes. *)
|
|
||||||
val draft : string option -> bool -> int list -> Pdf.t -> Pdf.t
|
|
||||||
|
|
||||||
val remove_all_text : int list -> Pdf.t -> Pdf.t
|
val remove_all_text : int list -> Pdf.t -> Pdf.t
|
||||||
|
|
||||||
(**/**)
|
(**/**)
|
||||||
|
@ -134,13 +131,6 @@ val print_dict_entry : Pdf.t -> string -> unit
|
||||||
|
|
||||||
val remove_clipping : Pdf.t -> int list -> Pdf.t
|
val remove_clipping : Pdf.t -> int list -> Pdf.t
|
||||||
|
|
||||||
val image_resolution : Pdf.t -> int list -> float -> (int * string * int * int * float * float) list
|
|
||||||
|
|
||||||
val copy_box : string -> string -> bool -> Pdf.t -> int list -> Pdf.t
|
val copy_box : string -> string -> bool -> Pdf.t -> int list -> Pdf.t
|
||||||
|
|
||||||
|
|
||||||
val remove_unused_resources : Pdf.t -> Pdf.t
|
val remove_unused_resources : Pdf.t -> Pdf.t
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
val create_pdf : int -> Pdfpaper.t -> Pdf.t
|
|
||||||
|
|
|
@ -3649,7 +3649,7 @@ let go () =
|
||||||
| Some Draft ->
|
| Some Draft ->
|
||||||
let pdf = get_single_pdf args.op false in
|
let pdf = get_single_pdf args.op false in
|
||||||
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
||||||
write_pdf false (Cpdf.draft args.removeonly args.boxes range pdf)
|
write_pdf false (Cpdfdraft.draft args.removeonly args.boxes range pdf)
|
||||||
| Some (AddText text) ->
|
| Some (AddText text) ->
|
||||||
let pdf = get_single_pdf args.op false in
|
let pdf = get_single_pdf args.op false in
|
||||||
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
||||||
|
@ -3778,7 +3778,7 @@ let go () =
|
||||||
| Some (ImageResolution f) ->
|
| Some (ImageResolution f) ->
|
||||||
let pdf = get_single_pdf args.op true in
|
let pdf = get_single_pdf args.op true in
|
||||||
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
||||||
let images = Cpdf.image_resolution pdf range f in
|
let images = Cpdfimage.image_resolution pdf range f in
|
||||||
iter
|
iter
|
||||||
(function (pagenum, xobject, w, h, wdpi, hdpi) ->
|
(function (pagenum, xobject, w, h, wdpi, hdpi) ->
|
||||||
if wdpi < f || hdpi < f then
|
if wdpi < f || hdpi < f then
|
||||||
|
@ -3854,7 +3854,7 @@ let go () =
|
||||||
let pdf = get_single_pdf args.op false in
|
let pdf = get_single_pdf args.op false in
|
||||||
write_pdf false (Cpdfbookmarks.bookmarks_open_to_level n pdf)
|
write_pdf false (Cpdfbookmarks.bookmarks_open_to_level n pdf)
|
||||||
| Some CreatePDF ->
|
| Some CreatePDF ->
|
||||||
let pdf = Cpdf.create_pdf args.createpdf_pages args.createpdf_pagesize in
|
let pdf = Cpdfcreate.blank_document_paper args.createpdf_pagesize args.createpdf_pages in
|
||||||
write_pdf false pdf
|
write_pdf false pdf
|
||||||
| Some RemoveAllText ->
|
| Some RemoveAllText ->
|
||||||
let pdf = get_single_pdf args.op false in
|
let pdf = get_single_pdf args.op false in
|
||||||
|
|
|
@ -0,0 +1,163 @@
|
||||||
|
open Pdfutil
|
||||||
|
open Pdfio
|
||||||
|
|
||||||
|
(* \section{Making draft documents} *)
|
||||||
|
|
||||||
|
(* Predicate on a (name, xobject) pair: true exactly when looking up /Subtype
   in the xobject yields the name /Image. The first component of the pair is
   ignored. *)
let isimage pdf (_, xobj) =
  match Pdf.lookup_direct pdf "/Subtype" xobj with
  | Some (Pdf.Name subtype) -> subtype = "/Image"
  | Some _ | None -> false
|
||||||
|
|
||||||
|
(* Given the resources of a page and the name of an xobject, decide whether
   that name refers to an image xobject.

   - If [resources] has no /XObject entry the answer is false.
   - If it has one, [name] is looked up in it with [Pdf.lookup_fail], so a
     name that is not present raises with the message "xobject not there".
   - If [resources] is not a dictionary, fails with "bad resources". *)
let xobject_isimage pdf resources name =
  match resources with
  | Pdf.Dictionary _ ->
      (match Pdf.lookup_direct pdf "/XObject" resources with
       | None -> false
       | Some xobjects ->
           let xobj = Pdf.lookup_fail "xobject not there" pdf name xobjects in
           (* isimage ignores the first component of the pair. *)
           isimage pdf ("", xobj))
  | _ -> failwith "bad resources"
|
||||||
|
|
||||||
|
(* The substitute for an image.

   When [boxes] is false nothing is drawn in place of the image and the result
   is the empty list. When [boxes] is true the result is a q ... Q group which
   outlines the unit square and strokes both of its diagonals (a crossed box).

   The operators are written here in drawing order and then reversed: the
   caller prepends this list to an accumulator which holds the operators in
   reverse order and is itself reversed at the end. *)
let substitute boxes =
  match boxes with
  | false -> []
  | true ->
      rev
        [ Pdfops.Op_q;
          Pdfops.Op_w 0.;
          Pdfops.Op_G 0.;
          (* outline of the unit square *)
          Pdfops.Op_re (0., 0., 1., 1.);
          (* first diagonal *)
          Pdfops.Op_m (0., 0.);
          Pdfops.Op_l (1., 1.);
          (* second diagonal *)
          Pdfops.Op_m (0., 1.);
          Pdfops.Op_l (1., 0.);
          Pdfops.Op_S;
          Pdfops.Op_Q ]
|
||||||
|
|
||||||
|
(* Remove references to images from a list of graphics operators.

   [onlyremove]  None: remove every image; Some n: remove only the image
                 xobject named n (inline images are then left alone).
   [boxes]       passed to [substitute] to choose what replaces an image.
   [resources]   the resources used to decide whether a Do names an image.
   [prev]        operators already produced, in reverse order; callers
                 normally pass [].

   The result is the rewritten operator list in drawing order. Any exception
   raised by [xobject_isimage] for a Do operator propagates to the caller. *)
let remove_images_stream onlyremove boxes pdf resources prev ops =
  (* What stands in for a removed image (already in reverse order). *)
  let placeholder = substitute boxes in
  let selected name =
    match onlyremove with
    | None -> true
    | Some wanted -> wanted = name
  in
  (* Push one operator, or its replacement, on the reversed accumulator. *)
  let step acc op =
    match op with
    | Pdfops.Op_Do name ->
        if xobject_isimage pdf resources name && selected name
        then placeholder @ acc
        else op :: acc
    | Pdfops.InlineImage _ ->
        if onlyremove = None then placeholder @ acc else op :: acc
    | _ -> op :: acc
  in
  rev (List.fold_left step prev ops)
|
||||||
|
|
||||||
|
(* Remove images from a form xobject.

   The form is wrapped in a temporary page (its own /Resources, or an empty
   dictionary if it has none) so that [remove_images_page] can process it. The
   rewritten content is then re-serialised and stored, uncompressed, in a new
   stream whose dictionary is the form's dictionary with /Length updated.

   Returns the new stream and the (possibly extended) document.
   Raises [Pdf.PDFError "not a stream"] if [form] does not resolve to a stream.

   NOTE(review): the returned stream keeps the dictionary read from the
   original form, so the resources computed by [remove_images_page] for the
   temporary page are not written back to the form - confirm this is intended.
   NOTE(review): the dictionary is deliberately read only after
   [remove_images_page] has run, as in the original; presumably parsing may
   decode the stream in place - verify before reordering. *)
let rec process_form_xobject onlyremove boxes pdf form =
  let form = Pdf.direct pdf form in
  let form_resources =
    match Pdf.lookup_direct pdf "/Resources" form with
    | None -> Pdf.Dictionary []
    | Some r -> r
  in
  let pseudo_page =
    { Pdfpage.content = [form];
      Pdfpage.mediabox = Pdf.Null;
      Pdfpage.resources = form_resources;
      Pdfpage.rotate = Pdfpage.Rotate0;
      Pdfpage.rest = Pdf.Dictionary [] }
  in
  let drafted, pdf = remove_images_page onlyremove boxes pdf pseudo_page in
  match form with
  | Pdf.Stream {contents = (dict, _)} ->
      let ops =
        Pdfops.parse_operators pdf (Pdf.Dictionary []) drafted.Pdfpage.content
      in
      (match Pdfops.stream_of_ops ops with
       | Pdf.Stream {contents = (_, Pdf.Got data)} ->
           let dict' =
             Pdf.add_dict_entry dict "/Length" (Pdf.Integer (bytes_size data))
           in
           (Pdf.Stream {contents = (dict', Pdf.Got data)}, pdf)
       | _ -> assert false)
  | _ -> raise (Pdf.PDFError "not a stream")
|
||||||
|
|
||||||
|
(* Remove images from a page.

   The page's /XObject dictionary is rebuilt so that it contains
   - the image xobjects which are to be kept (none when [onlyremove] is None,
     otherwise every image except the one named by [onlyremove]), followed by
   - one fresh indirect object per form xobject, holding the result of
     [process_form_xobject] on that form.
   Xobjects which are neither /Image nor /Form do not appear in the new
   dictionary. The page content is rewritten by [remove_images_stream] against
   the page's original resources and stored as a single Flate-encoded stream.

   Returns the new page together with the updated document. *)
and remove_images_page onlyremove boxes pdf page =
  let has_subtype wanted xobj =
    match Pdf.lookup_direct pdf "/Subtype" xobj with
    | Some (Pdf.Name s) -> s = wanted
    | _ -> false
  in
  let form_xobjects, image_xobjects =
    match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
    | Some (Pdf.Dictionary elts) ->
        (keep (fun (_, x) -> has_subtype "/Form" x) elts,
         keep (fun (_, x) -> has_subtype "/Image" x) elts)
    | _ -> ([], [])
  in
  let names, pointers = split form_xobjects in
  (* Phase 1: process every form in order, threading the document through. *)
  let rev_forms, pdf =
    List.fold_left
      (fun (acc, pdf) pointer ->
         let form', pdf' = process_form_xobject onlyremove boxes pdf pointer in
         (form' :: acc, pdf'))
      ([], pdf)
      pointers
  in
  (* Phase 2: only once all forms are processed, add them as new objects, in
     the same order as [names]. *)
  let rev_refs =
    List.fold_left
      (fun acc form' -> Pdf.Indirect (Pdf.addobj pdf form') :: acc)
      []
      (rev rev_forms)
  in
  let kept_images =
    match onlyremove with
    | None -> []
    | Some n -> keep (fun (n', _) -> n <> n') image_xobjects
  in
  let xobject_dict =
    Pdf.Dictionary (kept_images @ combine names (rev rev_refs))
  in
  let resources' =
    Pdf.add_dict_entry page.Pdfpage.resources "/XObject" xobject_dict
  in
  let content' =
    remove_images_stream onlyremove boxes pdf page.Pdfpage.resources []
      (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content)
  in
  let stream = Pdfops.stream_of_ops content' in
  Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream;
  ({page with Pdfpage.content = [stream]; Pdfpage.resources = resources'}, pdf)
|
||||||
|
|
||||||
|
(* Remove images from the pages of a document whose page numbers are in
   [range]; other pages are passed through untouched.

   [onlyremove] and [boxes] are handed to [remove_images_page] unchanged. The
   document returned by processing one page is the one used for the next, and
   the final page list is installed with [Pdfpage.change_pages true]. *)
let draft onlyremove boxes range pdf =
  let pages = Pdfpage.pages_of_pagetree pdf in
  (* Walk the pages with a running page number starting at 1, building the new
     page list in reverse. *)
  let _, pdf, rev_pages =
    List.fold_left
      (fun (pagenum, pdf, acc) page ->
         if mem pagenum range then
           let page', pdf' = remove_images_page onlyremove boxes pdf page in
           (pagenum + 1, pdf', page' :: acc)
         else
           (pagenum + 1, pdf, page :: acc))
      (1, pdf, [])
      pages
  in
  Pdfpage.change_pages true pdf (rev rev_pages)
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
(** Remove images from a PDF, optionally adding crossed boxes. *)
|
||||||
|
val draft : string option -> bool -> int list -> Pdf.t -> Pdf.t
|
148
cpdfimage.ml
148
cpdfimage.ml
|
@ -125,3 +125,151 @@ let extract_images path_to_p2p path_to_im encoding dedup dedup_per_page pdf rang
|
||||||
iter (extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms)
|
iter (extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms)
|
||||||
pages
|
pages
|
||||||
(indx pages)
|
(indx pages)
|
||||||
|
|
||||||
|
(* Image resolution *)

(* What is recorded about an xobject found in a page's resources. *)
type xobj =
  | Image of int * int (* width, height *)
  | Form of Pdftransform.transform_matrix * Pdf.pdfobject * Pdf.pdfobject
      (* the form's matrix, the form xobject itself, the resources to use *)

(* Results collected so far, most recently added first. *)
let image_results = ref []

(* Record one more result at the front of [image_results]. *)
let add_image_result i =
  let so_far = !image_results in
  image_results := i :: so_far
|
||||||
|
|
||||||
|
(* Given a page and the list of (pagenum, name, thing) recorded for it, walk
   the page's operators keeping a stack of transformation matrices (q pushes a
   copy of the top, Q pops, cm composes into the top) and handle each Do:

   - for an Image (w, h), the unit square's edges are transformed by the
     current matrix, their lengths converted from points to inches, and
     (pagenum, name, w, h, w / width_in_inches, h / height_in_inches) is
     recorded with [add_image_result];
   - for a Form, its content (prefixed by a cm operator combining the form's
     matrix with the current one) is made into a one-page document, which is
     then scanned with [image_resolution];
   - a Do naming an xobject which is not in [images] is skipped. Previously
     this hit [assert false], which the handler below reported as an error
     and which abandoned every remaining operator on the page.

   [dpi] is not used here other than being passed on to [image_resolution].

   Any exception raised while processing the page is printed to standard
   output as "Error ..." and the rest of the page is skipped.

   NOTE(review): the one-page document built for a form is scanned with the
   range [pagenum]; confirm that this selects the synthetic page when
   [pagenum] is not 1. *)
let rec image_resolution_page pdf page pagenum dpi (images : (int * string * xobj) list) =
  try
    let pageops = Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
    and transform = ref [ref Pdftransform.i_matrix] in
    (* The cell holding the current matrix: the top of the stack. *)
    let current_matrix () =
      match !transform with
      | [] -> raise (Failure "no transform")
      | top::_ -> top
    in
    (* First entry of the list recorded under the name [k], if any. *)
    let rec lookup_image k = function
      | [] -> None
      | (_, a, _) as h::_ when a = k -> Some h
      | _::t -> lookup_image k t
    in
      iter
        (function
         | Pdfops.Op_cm matrix ->
             let top = current_matrix () in
               top := Pdftransform.matrix_compose !top matrix
         | Pdfops.Op_Do xobject ->
             let ctm = !(current_matrix ()) in
             let trans point = Pdftransform.transform_matrix ctm point in
             (* Images are drawn in the unit square: origin and the ends of
                its two edges, in page space. *)
             let o = trans (0., 0.)
             and x = trans (1., 0.)
             and y = trans (0., 1.) in
               begin match lookup_image xobject images with
               | None -> ()
               | Some (pagenum, _, Form (xobj_matrix, content, resources)) ->
                   let content =
                     (* Add in matrix etc. *)
                     let total_matrix = Pdftransform.matrix_compose xobj_matrix ctm in
                     let ops =
                       Pdfops.Op_cm total_matrix::
                       Pdfops.parse_operators pdf resources [content]
                     in
                       Pdfops.stream_of_ops ops
                   in
                   let page =
                     {Pdfpage.content = [content];
                      Pdfpage.mediabox = Pdfpage.rectangle_of_paper Pdfpaper.a4;
                      Pdfpage.resources = resources;
                      Pdfpage.rotate = Pdfpage.Rotate0;
                      Pdfpage.rest = Pdf.Dictionary []}
                   in
                   let newpdf = Pdfpage.change_pages false pdf [page] in
                     image_resolution newpdf [pagenum] dpi
               | Some (pagenum, _, Image (w, h)) ->
                   let lx = Pdfunits.convert 0. Pdfunits.PdfPoint Pdfunits.Inch (distance_between o x)
                   and ly = Pdfunits.convert 0. Pdfunits.PdfPoint Pdfunits.Inch (distance_between o y) in
                   let wdpi = float w /. lx
                   and hdpi = float h /. ly in
                     add_image_result (pagenum, xobject, w, h, wdpi, hdpi)
               end
         | Pdfops.Op_q ->
             begin match !transform with
             | [] -> raise (Failure "Unbalanced q/Q ops")
             | h::t ->
                 (* Push a fresh cell so that later cm operators do not alter
                    the saved matrix. *)
                 transform := ref !h::h::t
             end
         | Pdfops.Op_Q ->
             begin match !transform with
             | [] -> raise (Failure "Unbalanced q/Q ops")
             | _::t -> transform := t
             end
         | _ -> ())
        pageops
  with
    e -> Printf.printf "Error %s\n" (Printexc.to_string e); flprint "\n"
|
||||||
|
|
||||||
|
(* Scan the pages of [pdf] whose numbers are in [range].

   First, every entry of each such page's /XObject dictionary is recorded as
   (pagenum, name, thing): an Image with its /Width and /Height (1 when the
   entry is missing), or a Form with its /Matrix (identity unless it is a
   six-element array), the form object itself, and its /Resources (the page's
   resources when the form has none). Other xobjects are not recorded.

   Then the records are grouped by page and [image_resolution_page] is called
   once per page that has any. *)
and image_resolution pdf range dpi =
  let images = ref [] in
  (* Numeric dictionary entry, defaulting to 1 when absent. *)
  let number_or_one key xobject =
    match Pdf.lookup_direct pdf key xobject with
    | Some x -> Pdf.getnum x
    | None -> 1.
  in
  let record pagenum page (name, xobject) =
    match Pdf.lookup_direct pdf "/Subtype" xobject with
    | Some (Pdf.Name "/Image") ->
        let width = number_or_one "/Width" xobject
        and height = number_or_one "/Height" xobject in
          images := (pagenum, name, Image (int_of_float width, int_of_float height))::!images
    | Some (Pdf.Name "/Form") ->
        let resources =
          match Pdf.lookup_direct pdf "/Resources" xobject with
          | Some r -> r
          | None -> page.Pdfpage.resources (* Inherit from page or form above. *)
        and matrix =
          match Pdf.lookup_direct pdf "/Matrix" xobject with
          | Some (Pdf.Array [a; b; c; d; e; f]) ->
              {Pdftransform.a = Pdf.getnum a; Pdftransform.b = Pdf.getnum b;
               Pdftransform.c = Pdf.getnum c; Pdftransform.d = Pdf.getnum d;
               Pdftransform.e = Pdf.getnum e; Pdftransform.f = Pdf.getnum f}
          | _ -> Pdftransform.i_matrix
        in
          images := (pagenum, name, Form (matrix, xobject, resources))::!images
    | _ -> ()
  in
    Cpdfpage.iter_pages
      (fun pagenum page ->
         match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
         | Some (Pdf.Dictionary xobjects) -> iter (record pagenum page) xobjects
         | _ -> ())
      pdf
      range;
    (* Now, split into differing pages, and call [image_resolution_page] on
       each one. *)
    let pagesplits =
      map
        (function (a, _, _)::_ as ls -> (a, ls) | _ -> assert false)
        (collate (fun (a, _, _) (b, _, _) -> compare a b) (rev !images))
    and pages =
      Pdfpage.pages_of_pagetree pdf
    in
      iter
        (fun (pagenum, page_images) ->
           image_resolution_page pdf (select pagenum pages) pagenum dpi page_images)
        pagesplits
|
||||||
|
|
||||||
|
(* Public entry point. Clears [image_results], runs the scan defined above
   (this definition is not recursive, so the call below refers to the earlier
   [image_resolution]), and returns the collected results in the order in
   which they were added. *)
let image_resolution pdf range dpi =
  image_results := [];
  let () = image_resolution pdf range dpi in
  rev !image_results
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
val extract_images : string ->
|
val extract_images : string ->
|
||||||
string ->
|
string ->
|
||||||
Cpdfmetadata.encoding -> bool -> bool -> Pdf.t -> int list -> string -> unit
|
Cpdfmetadata.encoding -> bool -> bool -> Pdf.t -> int list -> string -> unit
|
||||||
|
|
||||||
|
val image_resolution : Pdf.t -> int list -> float -> (int * string * int * int * float * float) list
|
||||||
|
|
Loading…
Reference in New Issue