-list-images finished

This commit is contained in:
John Whitington 2023-11-14 17:47:44 +00:00
parent ab405a7330
commit 24f899e346
2 changed files with 82 additions and 48 deletions

View File

@ -8,6 +8,10 @@ let version_date = "(patch 2, 25th October 2023)"
open Pdfutil open Pdfutil
open Pdfio open Pdfio
(* Join the strings in [strs] with a single space between consecutive
   elements, then strip leading and trailing whitespace from the result.
   Uses [String.concat], which builds the result in one pass instead of
   repeatedly re-copying an accumulator with [^]. Any leading separators
   produced by empty leading elements are removed by [String.trim]. *)
let combine_with_spaces strs =
String.trim (String.concat " " strs)
let tempfiles = ref [] let tempfiles = ref []
let exit n = let exit n =
@ -4183,7 +4187,16 @@ let go () =
if args.format_json then if args.format_json then
flprint (Cpdfyojson.Safe.pretty_to_string json) flprint (Cpdfyojson.Safe.pretty_to_string json)
else else
flprint "old fashioned output\n" begin match json with
| `List l ->
iter
(function (`Assoc [(_, `Int i); (_, `List pages); (_, `String name); (_, `Int w); (_, `Int h); (_, `String cs)]) ->
let pages = combine_with_spaces (map (function `Int i -> string_of_int i | _ -> "") pages) in
flprint (Printf.sprintf "%i, %s, %s, %i, %i, %s\n" i pages name w h cs)
| _ -> ())
l
| _ -> ()
end
| Some MissingFonts -> | Some MissingFonts ->
let pdf = get_single_pdf args.op true in let pdf = get_single_pdf args.op true in
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in

View File

@ -272,23 +272,18 @@ let image_resolution pdf range dpi =
image_resolution pdf range dpi; image_resolution pdf range dpi;
rev !image_results rev !image_results
(* FIXME Add colourspaces and anything else relevant *)
(* All the images in file referenced at least once from the given range of pages. *) (* All the images in file referenced at least once from the given range of pages. *)
let images pdf range = let images pdf range =
let images = null_hash () in let images = null_hash () in
Cpdfpage.iter_pages let formnums = null_hash () in
(fun pagenum page -> let rec process_xobject resources pagenum page (name, xobject) =
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary xobjects) ->
iter
(function (name, xobject) ->
match Pdf.lookup_direct pdf "/Subtype" xobject with match Pdf.lookup_direct pdf "/Subtype" xobject with
| Some (Pdf.Name "/Image") -> | Some (Pdf.Name "/Image") ->
begin match xobject with begin match xobject with
| Pdf.Indirect i -> | Pdf.Indirect i ->
begin match Hashtbl.find images i with begin match Hashtbl.find images i with
| (pagenums, n, w, h) -> | (pagenums, n, w, h, cs) ->
Hashtbl.replace images i (pagenum::pagenums, n, w, h) Hashtbl.replace images i (pagenum::pagenums, n, w, h, cs)
| exception Not_found -> | exception Not_found ->
let width = let width =
match Pdf.lookup_direct pdf "/Width" xobject with match Pdf.lookup_direct pdf "/Width" xobject with
@ -298,29 +293,55 @@ let images pdf range =
match Pdf.lookup_direct pdf "/Height" xobject with match Pdf.lookup_direct pdf "/Height" xobject with
| Some x -> Pdf.getnum pdf x | Some x -> Pdf.getnum pdf x
| None -> 1. | None -> 1.
and colourspace =
match Pdf.lookup_direct pdf "/ColorSpace" xobject with
| Some x -> Some (Pdfspace.string_of_colourspace (Pdfspace.read_colourspace pdf resources x))
| None -> None
in in
Hashtbl.replace images i ([pagenum], name, int_of_float width, int_of_float height) Hashtbl.replace images i ([pagenum], name, int_of_float width, int_of_float height, colourspace)
end end
| _ -> () | _ -> ()
end end
(* FIXME Look into form xobjects recursively *) | Some (Pdf.Name "/Form") ->
| _ -> ()) begin match xobject with
xobjects | Pdf.Indirect i ->
begin match Hashtbl.find formnums i with
| () -> ()
| exception Not_found ->
Hashtbl.add formnums i ();
begin match Pdf.lookup_direct pdf "/Resources" xobject with
| Some r ->
begin match Pdf.lookup_direct pdf "/XObject" r with
| Some (Pdf.Dictionary xobjects) -> iter (process_xobject r pagenum page) xobjects
| _ -> ()
end
| None -> ()
end
end
| _ -> ()
end
| _ -> ()
in
Cpdfpage.iter_pages
(fun pagenum page ->
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary xobjects) ->
iter (process_xobject page.Pdfpage.resources pagenum page) xobjects
| _ -> ()) | _ -> ())
pdf pdf
range; range;
(* Sort page numbers, then sort by first page number appearing, and build JSON structure *)
let images = list_of_hashtbl images in let images = list_of_hashtbl images in
let images = map (fun (i, (pnums, n, w, h)) -> (i, (setify (sort compare pnums), n, w, h))) images in let images = map (fun (i, (pnums, n, w, h, c)) -> (i, (setify (sort compare pnums), n, w, h, c))) images in
let images = sort (fun (_, (pnums, _, _, _)) (_, (pnums', _, _, _)) -> compare (hd pnums) (hd pnums')) images in let images = sort (fun (_, (pnums, _, _, _, _)) (_, (pnums', _, _, _, _)) -> compare (hd pnums) (hd pnums')) images in
`List `List
(map (map
(fun (i, (pnums, n, w, h)) -> (fun (i, (pnums, n, w, h, cs)) ->
`Assoc [("Object", `Int i); `Assoc [("Object", `Int i);
("Pages", `List (map (fun x -> `Int x) pnums)); ("Pages", `List (map (fun x -> `Int x) pnums));
("Path", `String n); ("Name", `String n);
("Width", `Int w); ("Width", `Int w);
("Height", `Int h)]) ("Height", `Int h);
("Colourspace", match cs with None -> `Null | Some s -> `String s)])
images) images)
let obj_of_jpeg_data data = let obj_of_jpeg_data data =