More splitting up of source files

This commit is contained in:
John Whitington 2021-11-12 14:50:31 -08:00
parent f6631aa101
commit 4e12e01848
3 changed files with 184 additions and 173 deletions

161
cpdf.ml
View File

@ -4488,3 +4488,164 @@ let image_resolution pdf range dpi =
image_results := [];
image_resolution pdf range dpi;
rev !image_results
(* copy the contents of the box f to the box t. If mediabox_if_missing is set,
the contents of the mediabox will be used if the from fox is not available. If
mediabox_is_missing is false, the page is unaltered. *)
let copy_box f t mediabox_if_missing pdf range =
process_pages
(ppstub (fun _ page ->
if f = "/MediaBox" then
{page with Pdfpage.rest =
(Pdf.add_dict_entry page.Pdfpage.rest t (page.Pdfpage.mediabox))}
else
match Pdf.lookup_direct pdf f page.Pdfpage.rest with
| Some pdfobject ->
if t = "/MediaBox"
then {page with
Pdfpage.mediabox = Pdf.direct pdf pdfobject}
else {page with Pdfpage.rest =
(Pdf.add_dict_entry page.Pdfpage.rest t (Pdf.direct pdf pdfobject))}
| None ->
if mediabox_if_missing
then {page with Pdfpage.rest = Pdf.add_dict_entry page.Pdfpage.rest t page.Pdfpage.mediabox}
else page))
pdf
range
let dump_attachment out pdf (_, embeddedfile) =
match Pdf.lookup_direct pdf "/F" embeddedfile with
| Some (Pdf.String s) ->
let efdata =
begin match Pdf.lookup_direct pdf "/EF" embeddedfile with
| Some d ->
let stream =
match Pdf.lookup_direct pdf "/F" d with
| Some s -> s
| None -> error "Bad embedded file stream"
in
Pdfcodec.decode_pdfstream_until_unknown pdf stream;
begin match stream with Pdf.Stream {contents = (_, Pdf.Got b)} -> b | _ -> error "Bad embedded file stream" end
| _ -> error "Bad embedded file stream"
end
in
let s = remove_unsafe_characters s in
let filename = if out = "" then s else out ^ Filename.dir_sep ^ s in
begin try
let fh = open_out_bin filename in
for x = 0 to bytes_size efdata - 1 do output_byte fh (bget efdata x) done;
close_out fh
with
e -> Printf.eprintf "Failed to write attachment to %s\n%!" filename;
end
| _ -> ()
let dump_attached_document pdf out =
let root = Pdf.lookup_obj pdf pdf.Pdf.root in
let names =
match Pdf.lookup_direct pdf "/Names" root with Some n -> n | _ -> Pdf.Dictionary []
in
match Pdf.lookup_direct pdf "/EmbeddedFiles" names with
| Some x ->
iter (dump_attachment out pdf) (Pdf.contents_of_nametree pdf x)
| None -> ()
let dump_attached_page pdf out page =
let annots =
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array l) -> l
| _ -> []
in
let efannots =
keep
(fun annot ->
match Pdf.lookup_direct pdf "/Subtype" annot with
| Some (Pdf.Name "/FileAttachment") -> true
| _ -> false)
annots
in
let fsannots = option_map (Pdf.lookup_direct pdf "/FS") efannots in
iter (dump_attachment out pdf) (map (fun x -> 0, x) fsannots)
(* Dump both document-level and page-level attached files to file, using their file names *)
let dump_attached_files pdf out =
try
dump_attached_document pdf out;
iter (dump_attached_page pdf out) (Pdfpage.pages_of_pagetree pdf)
with
e -> error (Printf.sprintf "Couldn't dump attached files: %s\n" (Printexc.to_string e))
let remove_unused_resources_page pdf n page =
let xobjects, all_names =
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary d) -> Pdf.Dictionary d, map fst d
| _ -> Pdf.Dictionary [], []
in
let names_to_keep =
option_map
(function Pdfops.Op_Do n -> Some n | _ -> None)
(Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content)
in
let names_to_remove = lose (mem' names_to_keep) all_names in
let xobjdict = fold_left (Pdf.remove_dict_entry) xobjects names_to_remove in
{page with Pdfpage.resources = Pdf.add_dict_entry page.Pdfpage.resources "/XObject" xobjdict}
let remove_unused_resources pdf =
process_pages (ppstub (remove_unused_resources_page pdf)) pdf (ilist 1 (Pdfpage.endpage pdf))
let print_spot_colour n s =
Printf.printf "%i %s\n" n s
let list_spot_colours pdf =
Pdf.objiter
(fun _ obj ->
match obj with
Pdf.Array (Pdf.Name "/Separation"::x::_) ->
begin match Pdf.direct pdf x with
Pdf.Name col -> Printf.printf "%s\n" col
| _ -> ()
end
| _ -> ())
pdf
(* Indent bookmarks in each file by one and add a title bookmark pointing to the first page. *)
let add_bookmark_title filename use_title pdf =
let title =
if use_title then
match get_info_utf8 pdf "/Title", get_xmp_info pdf "/Title" with
"", x | x, "" | _, x -> x
else
Filename.basename filename
in
let marks = Pdfmarks.read_bookmarks pdf in
let page1objnum =
match Pdfpage.page_object_number pdf 1 with
None -> error "add_bookmark_title: page not found"
| Some x -> x
in
let newmarks =
{Pdfmarks.level = 0;
Pdfmarks.text = title;
Pdfmarks.target = Pdfdest.XYZ (Pdfdest.PageObject page1objnum, None, None, None);
Pdfmarks.isopen = false}
::map (function m -> {m with Pdfmarks.level = m.Pdfmarks.level + 1}) marks
in
Pdfmarks.add_bookmarks newmarks pdf
let bookmarks_open_to_level n pdf =
let marks = Pdfmarks.read_bookmarks pdf in
let newmarks =
map
(fun m -> {m with Pdfmarks.isopen = m.Pdfmarks.level < n})
marks
in
Pdfmarks.add_bookmarks newmarks pdf
let create_pdf pages pagesize =
let page =
{(Pdfpage.blankpage pagesize) with
Pdfpage.content = [Pdfops.stream_of_ops []];
Pdfpage.resources = Pdf.Dictionary []}
in
let pdf, pageroot = Pdfpage.add_pagetree (many page pages) (Pdf.empty ()) in
Pdfpage.add_root pageroot [] pdf

View File

@ -397,3 +397,18 @@ val print_dict_entry : Pdf.t -> string -> unit
val remove_clipping : Pdf.t -> int list -> Pdf.t
val image_resolution : Pdf.t -> int list -> float -> (int * string * int * int * float * float) list
val copy_box : string -> string -> bool -> Pdf.t -> int list -> Pdf.t
val dump_attached_files : Pdf.t -> string -> unit
val add_bookmark_title : string -> bool -> Pdf.t -> Pdf.t
val remove_unused_resources : Pdf.t -> Pdf.t
val list_spot_colours : Pdf.t -> unit
val bookmarks_open_to_level : int -> Pdf.t -> Pdf.t
val create_pdf : int -> Pdfpaper.t -> Pdf.t

View File

@ -915,9 +915,6 @@ let anon_fun s =
(* If a password begins with a dash, we allow -pw=<password> too *)
let setdashpassword = anon_fun
(*let firstpage pdf =
List.hd (Pdfpage.pages_of_pagetree pdf)*)
(* Setting operations *)
let setcrop s =
setop Crop ();
@ -3056,93 +3053,6 @@ let getencryption pdf =
| Some (Pdfwrite.AES256bitISO true) -> "256bit AES ISO, Metadata encrypted"
| Some (Pdfwrite.AES256bitISO false) -> "256bit AES ISO, Metadata not encrypted"
(* copy the contents of the box f to the box t. If mediabox_if_missing is set,
the contents of the mediabox will be used if the from fox is not available. If
mediabox_is_missing is false, the page is unaltered. *)
let copy_box f t mediabox_if_missing pdf range =
Cpdf.process_pages
(ppstub (fun _ page ->
if f = "/MediaBox" then
{page with Pdfpage.rest =
(Pdf.add_dict_entry page.Pdfpage.rest t (page.Pdfpage.mediabox))}
else
match Pdf.lookup_direct pdf f page.Pdfpage.rest with
| Some pdfobject ->
if t = "/MediaBox"
then {page with
Pdfpage.mediabox = Pdf.direct pdf pdfobject}
else {page with Pdfpage.rest =
(Pdf.add_dict_entry page.Pdfpage.rest t (Pdf.direct pdf pdfobject))}
| None ->
if mediabox_if_missing
then {page with Pdfpage.rest = Pdf.add_dict_entry page.Pdfpage.rest t page.Pdfpage.mediabox}
else page))
pdf
range
let dump_attachment out pdf (_, embeddedfile) =
match Pdf.lookup_direct pdf "/F" embeddedfile with
| Some (Pdf.String s) ->
let efdata =
begin match Pdf.lookup_direct pdf "/EF" embeddedfile with
| Some d ->
let stream =
match Pdf.lookup_direct pdf "/F" d with
| Some s -> s
| None -> error "Bad embedded file stream"
in
Pdfcodec.decode_pdfstream_until_unknown pdf stream;
begin match stream with Pdf.Stream {contents = (_, Pdf.Got b)} -> b | _ -> error "Bad embedded file stream" end
| _ -> error "Bad embedded file stream"
end
in
let s = remove_unsafe_characters s in
let filename = if out = "" then s else out ^ Filename.dir_sep ^ s in
begin try
let fh = open_out_bin filename in
for x = 0 to bytes_size efdata - 1 do output_byte fh (bget efdata x) done;
close_out fh
with
e -> Printf.eprintf "Failed to write attachment to %s\n%!" filename;
end
| _ -> ()
let dump_attached_document pdf out =
let root = Pdf.lookup_obj pdf pdf.Pdf.root in
let names =
match Pdf.lookup_direct pdf "/Names" root with Some n -> n | _ -> Pdf.Dictionary []
in
match Pdf.lookup_direct pdf "/EmbeddedFiles" names with
| Some x ->
iter (dump_attachment out pdf) (Pdf.contents_of_nametree pdf x)
| None -> ()
let dump_attached_page pdf out page =
let annots =
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array l) -> l
| _ -> []
in
let efannots =
keep
(fun annot ->
match Pdf.lookup_direct pdf "/Subtype" annot with
| Some (Pdf.Name "/FileAttachment") -> true
| _ -> false)
annots
in
let fsannots = option_map (Pdf.lookup_direct pdf "/FS") efannots in
iter (dump_attachment out pdf) (map (fun x -> 0, x) fsannots)
(* Dump both document-level and page-level attached files to file, using their file names *)
let dump_attached_files pdf out =
try
dump_attached_document pdf out;
iter (dump_attached_page pdf out) (Pdfpage.pages_of_pagetree pdf)
with
e -> error (Printf.sprintf "Couldn't dump attached files: %s\n" (Printexc.to_string e))
(* If pages in stamp < pages in main, extend stamp by repeating its last page. If pages in stamp more, chop stamp *)
let equalize_pages_extend main stamp =
let length_stamp = Pdfpage.endpage stamp
@ -3156,81 +3066,6 @@ let equalize_pages_extend main stamp =
then chop stamp length_main
else extend_lastpage (last (Pdfpage.pages_of_pagetree stamp)) stamp (length_main - length_stamp)
let remove_unused_resources_page pdf n page =
let xobjects, all_names =
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary d) -> Pdf.Dictionary d, map fst d
| _ -> Pdf.Dictionary [], []
in
let names_to_keep =
option_map
(function Pdfops.Op_Do n -> Some n | _ -> None)
(Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content)
in
let names_to_remove = lose (mem' names_to_keep) all_names in
let xobjdict = fold_left (Pdf.remove_dict_entry) xobjects names_to_remove in
{page with Pdfpage.resources = Pdf.add_dict_entry page.Pdfpage.resources "/XObject" xobjdict}
let remove_unused_resources pdf =
Cpdf.process_pages (ppstub (remove_unused_resources_page pdf)) pdf (ilist 1 (Pdfpage.endpage pdf))
let print_spot_colour n s =
Printf.printf "%i %s\n" n s
let list_spot_colours pdf =
Pdf.objiter
(fun _ obj ->
match obj with
Pdf.Array (Pdf.Name "/Separation"::x::_) ->
begin match Pdf.direct pdf x with
Pdf.Name col -> Printf.printf "%s\n" col
| _ -> ()
end
| _ -> ())
pdf
(* Indent bookmarks in each file by one and add a title bookmark pointing to the first page. *)
let add_bookmark_title filename use_title pdf =
let title =
if use_title then
match Cpdf.get_info_utf8 pdf "/Title", Cpdf.get_xmp_info pdf "/Title" with
"", x | x, "" | _, x -> x
else
Filename.basename filename
in
let marks = Pdfmarks.read_bookmarks pdf in
let page1objnum =
match Pdfpage.page_object_number pdf 1 with
None -> error "add_bookmark_title: page not found"
| Some x -> x
in
let newmarks =
{Pdfmarks.level = 0;
Pdfmarks.text = title;
Pdfmarks.target = Pdfdest.XYZ (Pdfdest.PageObject page1objnum, None, None, None);
Pdfmarks.isopen = false}
::map (function m -> {m with Pdfmarks.level = m.Pdfmarks.level + 1}) marks
in
Pdfmarks.add_bookmarks newmarks pdf
let bookmarks_open_to_level n pdf =
let marks = Pdfmarks.read_bookmarks pdf in
let newmarks =
map
(fun m -> {m with Pdfmarks.isopen = m.Pdfmarks.level < n})
marks
in
Pdfmarks.add_bookmarks newmarks pdf
let create_pdf pages pagesize =
let page =
{(Pdfpage.blankpage args.createpdf_pagesize) with
Pdfpage.content = [Pdfops.stream_of_ops []];
Pdfpage.resources = Pdf.Dictionary []}
in
let pdf, pageroot = Pdfpage.add_pagetree (many page args.createpdf_pages) (Pdf.empty ()) in
Pdfpage.add_root pageroot [] pdf
let write_json output pdf =
match output with
| NoOutputSpecified ->
@ -3317,7 +3152,7 @@ let go () =
let pdfs =
if args.merge_add_bookmarks then
map2
(fun filename pdf -> add_bookmark_title filename args.merge_add_bookmarks_use_titles pdf)
(fun filename pdf -> Cpdf.add_bookmark_title filename args.merge_add_bookmarks_use_titles pdf)
(map (function InFile s -> s | StdIn -> "" | AlreadyInMemory _ -> "") names)
pdfs
else
@ -3348,7 +3183,7 @@ let go () =
begin match args.inputs, args.out with
| _::_, _ ->
let pdf = get_single_pdf (Some RemoveUnusedResources) false in
let outpdf = remove_unused_resources pdf in
let outpdf = Cpdf.remove_unused_resources pdf in
write_pdf true outpdf
| _ -> error "RemoveUnusedResources: bad command line"
end
@ -3523,7 +3358,7 @@ let go () =
| _ -> error "Copy box: no tobox or no frombox specified"
end
in
let pdf = copy_box f t args.mediabox_if_missing pdf range in
let pdf = Cpdf.copy_box f t args.mediabox_if_missing pdf range in
write_pdf false pdf
| _ -> error "Copy Box: bad command line"
end
@ -3866,8 +3701,8 @@ let go () =
| Some DumpAttachedFiles ->
let pdf = get_single_pdf args.op false in
begin match args.out with
| NoOutputSpecified -> dump_attached_files pdf ""
| File n -> dump_attached_files pdf n
| NoOutputSpecified -> Cpdf.dump_attached_files pdf ""
| File n -> Cpdf.dump_attached_files pdf n
| Stdout -> error "Can't dump attachments to stdout"
end
| Some RemoveAttachedFiles ->
@ -4109,7 +3944,7 @@ let go () =
Cpdf.print_dict_entry pdf key
| Some ListSpotColours ->
let pdf = get_single_pdf args.op false in
list_spot_colours pdf
Cpdf.list_spot_colours pdf
| Some RemoveClipping ->
let pdf = get_single_pdf args.op false in
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
@ -4131,9 +3966,9 @@ let go () =
Cpdffont.embed_missing_fonts args.path_to_ghostscript args.gs_quiet fi fo
| Some (BookmarksOpenToLevel n) ->
let pdf = get_single_pdf args.op false in
write_pdf false (bookmarks_open_to_level n pdf)
write_pdf false (Cpdf.bookmarks_open_to_level n pdf)
| Some CreatePDF ->
let pdf = create_pdf args.createpdf_pages args.createpdf_pagesize in
let pdf = Cpdf.create_pdf args.createpdf_pages args.createpdf_pagesize in
write_pdf false pdf
| Some RemoveAllText ->
let pdf = get_single_pdf args.op false in