cpdf-source/cpdfdraft.ml

164 lines
5.6 KiB
OCaml
Raw Permalink Normal View History

2021-12-21 16:25:59 +01:00
open Pdfutil
open Pdfio
2021-12-31 20:33:34 +01:00
(* Making draft documents *)
2021-12-21 16:25:59 +01:00
(* Predicate on an xobject: true if an image xobject. *)
let isimage pdf (_, xobj) =
match Pdf.lookup_direct pdf "/Subtype" xobj with
| Some (Pdf.Name "/Image") -> true
| _ -> false
(* Given a set of resources for a page, and the name of a resource, determine if
that name refers to an image xobject. *)
let xobject_isimage pdf resources name =
match resources with
| Pdf.Dictionary _ ->
begin match Pdf.lookup_direct pdf "/XObject" resources with
| Some xobjects ->
isimage pdf ("", Pdf.lookup_fail "xobject not there" pdf name xobjects)
| _ -> false
end
| _ -> failwith "bad resources"
(* The subsitute for an image. *)
let substitute boxes =
if boxes then
rev
[Pdfops.Op_q;
Pdfops.Op_w 0.;
Pdfops.Op_G 0.;
Pdfops.Op_re (0., 0., 1., 1.);
Pdfops.Op_m (0., 0.);
Pdfops.Op_l (1., 1.);
Pdfops.Op_m (0., 1.);
Pdfops.Op_l (1., 0.);
Pdfops.Op_S;
Pdfops.Op_Q]
else
[]
(* Remove references to images from a graphics stream. *)
let rec remove_images_stream onlyremove boxes pdf resources prev = function
| [] -> rev prev
| (Pdfops.Op_Do name) as h::t ->
if xobject_isimage pdf resources name && (match onlyremove with None -> true | Some x -> x = name)
then remove_images_stream onlyremove boxes pdf resources (substitute boxes @ prev) t
else remove_images_stream onlyremove boxes pdf resources (h::prev) t
| Pdfops.InlineImage _ as h::t ->
if onlyremove <> None
then remove_images_stream onlyremove boxes pdf resources (h::prev) t
else remove_images_stream onlyremove boxes pdf resources (substitute boxes @ prev) t
| h::t ->
remove_images_stream onlyremove boxes pdf resources (h::prev) t
let rec process_form_xobject onlyremove boxes pdf form =
let form = Pdf.direct pdf form in
let page =
{Pdfpage.content = [form];
Pdfpage.mediabox = Pdf.Null;
Pdfpage.resources =
begin match Pdf.lookup_direct pdf "/Resources" form with
| Some r -> r
| None -> Pdf.Dictionary []
end;
Pdfpage.rotate = Pdfpage.Rotate0;
Pdfpage.rest = Pdf.Dictionary []}
in
let page', pdf =
remove_images_page onlyremove boxes pdf page
in
let form' =
match form with
| Pdf.Stream {contents = (dict, _)} ->
begin match
Pdfops.stream_of_ops
(Pdfops.parse_operators pdf (Pdf.Dictionary []) page'.Pdfpage.content)
with
| Pdf.Stream {contents = (_, Pdf.Got data)} ->
let dict' =
Pdf.add_dict_entry dict "/Length" (Pdf.Integer (bytes_size data))
in
Pdf.Stream {contents = (dict', Pdf.Got data)}
| _ -> assert false
end
| _ -> raise (Pdf.PDFError "not a stream")
in
form', pdf
(* Remove images from a page. *)
and remove_images_page onlyremove boxes pdf page =
let isform pdf xobj =
match Pdf.lookup_direct pdf "/Subtype" xobj with Some (Pdf.Name "/Form") -> true | _ -> false
in
let isimage pdf xobj =
match Pdf.lookup_direct pdf "/Subtype" xobj with Some (Pdf.Name "/Image") -> true | _ -> false
in
(* Remove image xobjects and look into form ones *)
let form_xobjects, image_xobjects =
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary elts) ->
keep (function (_, p) -> isform pdf p) elts,
keep (function (_, p) -> isimage pdf p) elts
| _ -> [], []
in
let resources', pdf =
let names, pointers = split form_xobjects in
let form_xobjects', pdf =
let pdf = ref pdf
in let outputs = ref [] in
iter
(fun p ->
let p', pdf' = process_form_xobject onlyremove boxes !pdf p in
pdf := pdf';
outputs =| p')
pointers;
rev !outputs, !pdf
in
let nums = ref [] in
iter
(fun xobj ->
let objnum = Pdf.addobj pdf xobj in
nums =| objnum)
form_xobjects';
let image_xobjects' =
match onlyremove with
None -> []
| Some n -> option_map (function (n', _) as xobj -> if n = n' then None else Some xobj) image_xobjects
in
let newdict =
Pdf.Dictionary (image_xobjects' @ combine names (map (fun x -> Pdf.Indirect x) (rev !nums)))
in
Pdf.add_dict_entry page.Pdfpage.resources "/XObject" newdict, pdf
in
let content' =
remove_images_stream onlyremove boxes pdf page.Pdfpage.resources []
(Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content)
in
{page with
Pdfpage.content =
(let stream = Pdfops.stream_of_ops content' in
Pdfcodec.encode_pdfstream ~only_if_smaller:true pdf Pdfcodec.Flate stream;
2021-12-21 16:25:59 +01:00
[stream]);
Pdfpage.resources = resources'}, pdf
(* Remove images from all pages in a document. *)
let draft onlyremove boxes range pdf =
let pages = Pdfpage.pages_of_pagetree pdf in
let pagenums = indx pages in
let pdf = ref pdf
in let pages' = ref [] in
iter2
(fun p pagenum ->
let p', pdf' =
if mem pagenum range
then remove_images_page onlyremove boxes !pdf p
else p, !pdf
in
pdf := pdf';
pages' =| p')
pages
pagenums;
Pdfpage.change_pages true !pdf (rev !pages')