open Pdfutil open Pdfio (* Making draft documents *) (* Predicate on an xobject: true if an image xobject. *) let isimage pdf (_, xobj) = match Pdf.lookup_direct pdf "/Subtype" xobj with | Some (Pdf.Name "/Image") -> true | _ -> false (* Given a set of resources for a page, and the name of a resource, determine if that name refers to an image xobject. *) let xobject_isimage pdf resources name = match resources with | Pdf.Dictionary _ -> begin match Pdf.lookup_direct pdf "/XObject" resources with | Some xobjects -> isimage pdf ("", Pdf.lookup_fail "xobject not there" pdf name xobjects) | _ -> false end | _ -> failwith "bad resources" (* The subsitute for an image. *) let substitute boxes = if boxes then rev [Pdfops.Op_q; Pdfops.Op_w 0.; Pdfops.Op_G 0.; Pdfops.Op_re (0., 0., 1., 1.); Pdfops.Op_m (0., 0.); Pdfops.Op_l (1., 1.); Pdfops.Op_m (0., 1.); Pdfops.Op_l (1., 0.); Pdfops.Op_S; Pdfops.Op_Q] else [] (* Remove references to images from a graphics stream. *) let rec remove_images_stream onlyremove boxes pdf resources prev = function | [] -> rev prev | (Pdfops.Op_Do name) as h::t -> if xobject_isimage pdf resources name && (match onlyremove with None -> true | Some x -> x = name) then remove_images_stream onlyremove boxes pdf resources (substitute boxes @ prev) t else remove_images_stream onlyremove boxes pdf resources (h::prev) t | Pdfops.InlineImage _ as h::t -> if onlyremove <> None then remove_images_stream onlyremove boxes pdf resources (h::prev) t else remove_images_stream onlyremove boxes pdf resources (substitute boxes @ prev) t | h::t -> remove_images_stream onlyremove boxes pdf resources (h::prev) t let rec process_form_xobject onlyremove boxes pdf form = let form = Pdf.direct pdf form in let page = {Pdfpage.content = [form]; Pdfpage.mediabox = Pdf.Null; Pdfpage.resources = begin match Pdf.lookup_direct pdf "/Resources" form with | Some r -> r | None -> Pdf.Dictionary [] end; Pdfpage.rotate = Pdfpage.Rotate0; Pdfpage.rest = Pdf.Dictionary []} in let page', pdf = remove_images_page onlyremove boxes pdf page in let form' = match form with | Pdf.Stream {contents = (dict, _)} -> begin match Pdfops.stream_of_ops (Pdfops.parse_operators pdf (Pdf.Dictionary []) page'.Pdfpage.content) with | Pdf.Stream {contents = (_, Pdf.Got data)} -> let dict' = Pdf.add_dict_entry dict "/Length" (Pdf.Integer (bytes_size data)) in Pdf.Stream {contents = (dict', Pdf.Got data)} | _ -> assert false end | _ -> raise (Pdf.PDFError "not a stream") in form', pdf (* Remove images from a page. *) and remove_images_page onlyremove boxes pdf page = let isform pdf xobj = match Pdf.lookup_direct pdf "/Subtype" xobj with Some (Pdf.Name "/Form") -> true | _ -> false in let isimage pdf xobj = match Pdf.lookup_direct pdf "/Subtype" xobj with Some (Pdf.Name "/Image") -> true | _ -> false in (* Remove image xobjects and look into form ones *) let form_xobjects, image_xobjects = match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with | Some (Pdf.Dictionary elts) -> keep (function (_, p) -> isform pdf p) elts, keep (function (_, p) -> isimage pdf p) elts | _ -> [], [] in let resources', pdf = let names, pointers = split form_xobjects in let form_xobjects', pdf = let pdf = ref pdf in let outputs = ref [] in iter (fun p -> let p', pdf' = process_form_xobject onlyremove boxes !pdf p in pdf := pdf'; outputs =| p') pointers; rev !outputs, !pdf in let nums = ref [] in iter (fun xobj -> let objnum = Pdf.addobj pdf xobj in nums =| objnum) form_xobjects'; let image_xobjects' = match onlyremove with None -> [] | Some n -> option_map (function (n', _) as xobj -> if n = n' then None else Some xobj) image_xobjects in let newdict = Pdf.Dictionary (image_xobjects' @ combine names (map (fun x -> Pdf.Indirect x) (rev !nums))) in Pdf.add_dict_entry page.Pdfpage.resources "/XObject" newdict, pdf in let content' = remove_images_stream onlyremove boxes pdf page.Pdfpage.resources [] (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content) in {page with Pdfpage.content = (let stream = Pdfops.stream_of_ops content' in Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream; [stream]); Pdfpage.resources = resources'}, pdf (* Remove images from all pages in a document. *) let draft onlyremove boxes range pdf = let pages = Pdfpage.pages_of_pagetree pdf in let pagenums = indx pages in let pdf = ref pdf in let pages' = ref [] in iter2 (fun p pagenum -> let p', pdf' = if mem pagenum range then remove_images_page onlyremove boxes !pdf p else p, !pdf in pdf := pdf'; pages' =| p') pages pagenums; Pdfpage.change_pages true !pdf (rev !pages')