cpdf-source/cpdfJSON.ml

327 lines
14 KiB
OCaml
Raw Normal View History

2021-10-01 16:21:03 +02:00
(*FIXME failwiths -> Pdf.PDFError or similar *)
(*FIXME flprintfs to eprintf *)
2020-03-04 19:50:32 +01:00
open Pdfutil
2020-01-30 11:42:24 +01:00
module J = Tjjson
2020-01-30 14:10:03 +01:00
module P = Pdf
2020-01-30 15:10:30 +01:00
module O = Pdfops
2020-01-30 11:42:24 +01:00
(* Number formatting used throughout this file.
   [sof] renders a float with "%f" so that the output never ends in a bare
   "0." and [soi] renders an int in decimal. *)
let sof x = Printf.sprintf "%f" x
let soi n = Printf.sprintf "%d" n

(* Deliberately shadow the standard conversions so that any accidental use
   further down fails loudly and points at the helper to use instead. *)
let string_of_float = fun _ -> failwith "use sof"
let string_of_int = fun _ -> failwith "use soi"
2020-01-31 15:46:33 +01:00
2021-10-01 16:21:03 +02:00
(* Convert one JSON value back into a PDF object.

   Encoding understood here:
   - a bare JSON number is an indirect reference to that object number;
   - an object with the single key I holds an integer, F a real, N a name;
   - an object with the single key S holds a two-element array of the
     stream dictionary and the stream data as a string;
   - any other JSON object is a PDF dictionary.

   The numeric conversions use int_of_string / float_of_string and so raise
   Failure on malformed numbers. *)
let rec object_of_json json =
  let entry (key, value) = (key, object_of_json value) in
  match json with
  | J.Null -> P.Null
  | J.Bool flag -> P.Boolean flag
  | J.Number objnum -> P.Indirect (int_of_string objnum)
  | J.String text -> P.String text
  | J.Array items -> P.Array (map object_of_json items)
  | J.Object [("I", J.Number digits)] -> P.Integer (int_of_string digits)
  | J.Object [("F", J.Number digits)] -> P.Real (float_of_string digits)
  | J.Object [("N", J.String name)] -> P.Name name
  | J.Object [("S", J.Array [dict; J.String data])] ->
      let payload = P.Got (Pdfio.bytes_of_string data) in
      P.Stream (ref (object_of_json dict, payload))
  | J.Object fields -> P.Dictionary (map entry fields)
2020-02-04 17:50:04 +01:00
(* Convert a PDF object to its JSON representation.

   [pdf] is the document used to resolve indirect references.
   [fcs] is called with the object number of every object this function
   identifies as a content stream: targets of a /Contents entry (either a
   single indirect reference or the indirect members of an array), and
   indirect streams whose /Subtype is /Form.
   [no_stream_data], when true, replaces stream data with a placeholder
   string, except for streams carrying a /FunctionType entry, which are
   always decoded and included.

   Integers, reals and names are wrapped in single-key objects (I, F, N);
   a stream becomes an object with key S holding [dictionary; data]; an
   indirect reference becomes a bare JSON number. *)
let rec json_of_object pdf fcs no_stream_data = function
  | P.Null -> J.Null
  | P.Boolean b -> J.Bool b
  | P.Integer i -> J.Object [("I", J.Number (soi i))]
  | P.Real r -> J.Object [("F", J.Number (sof r))]
  | P.String s -> J.String s
  | P.Name n -> J.Object [("N", J.String n)]
  | P.Array objs -> J.Array (map (json_of_object pdf fcs no_stream_data) objs)
  | P.Dictionary elts ->
      (* First pass: report any content streams referenced from /Contents. *)
      iter
        (function
         | ("/Contents", P.Indirect i) ->
             begin match Pdf.lookup_obj pdf i with
             | Pdf.Array is -> iter (function Pdf.Indirect i -> fcs i | _ -> ()) is
             | _ -> fcs i
             end
         | ("/Contents", P.Array elts) ->
             iter (function P.Indirect i -> fcs i | _ -> ()) elts
         | _ -> ())
        elts;
      J.Object (map (fun (k, v) -> (k, json_of_object pdf fcs no_stream_data v)) elts)
  | P.Stream ({contents = (P.Dictionary dict as d, _)} as mut) as thestream ->
      P.getstream thestream;
      (* The stream contents must be read through [!mut] from here on: a
         value bound by the pattern above predates the call to getstream and
         may still describe data that had not been loaded yet. *)
      let str =
        match P.lookup_direct pdf "/FunctionType" d with
        | Some _ ->
            (* NOTE(review): the dictionary emitted below is the one captured
               before decoding; confirm whether it should reflect the decoded
               stream instead. *)
            Pdfcodec.decode_pdfstream_until_unknown pdf thestream;
            begin match !mut with
            | (_, P.Got b) -> Pdfio.string_of_bytes b
            | _ -> "failure: decomp"
            end
        | None ->
            if no_stream_data then "<<stream data elided>>" else
              begin match !mut with
              | (_, P.Got b) -> Pdfio.string_of_bytes b
              | _ -> "failure: toget"
              end
      in
      json_of_object pdf fcs no_stream_data
        (P.Dictionary [("S", P.Array [P.Dictionary dict; P.String str])])
  | P.Stream _ -> J.String "error: stream with not-a-dictionary"
  | P.Indirect i ->
      (* Form XObjects carry content too, so report them via [fcs]. *)
      begin match P.lookup_obj pdf i with
      | P.Stream {contents = (P.Dictionary _ as d, _)} ->
          begin match P.lookup_direct pdf "/Subtype" d with
          | Some (P.Name "/Form") -> fcs i
          | _ -> ()
          end
      | _ -> ()
      end;
      J.Number (soi i)
2020-01-30 17:07:40 +01:00
2020-02-04 17:50:04 +01:00
(* Convert a single content-stream operator to a JSON array.

   In general the array lists the operands in order followed by the operator
   name as a string. Points worth knowing when consuming the output:
   - the unknown-operator case emits only the tag Unknown and drops its
     payload;
   - an inline image is emitted as [dictionary; data] with no trailing tag;
   - the property object of BDC is emitted as a string produced by
     Pdfwrite.string_of_pdf, whereas DP emits it as JSON.

   [pdf] and [no_stream_data] are passed through to json_of_object for
   operands that are PDF objects; no content streams are collected there. *)
let json_of_op pdf no_stream_data =
  let str s = J.String s in
  let real x = J.Number (sof x) in
  let whole n = J.Number (soi n) in
  let reals xs = map real xs in
  (* Operands first, operator name last. *)
  let op name operands = J.Array (operands @ [str name]) in
  let matrix name t =
    op name
      (reals
         [t.Pdftransform.a; t.Pdftransform.b; t.Pdftransform.c;
          t.Pdftransform.d; t.Pdftransform.e; t.Pdftransform.f])
  in
  let embedded obj = json_of_object pdf (fun _ -> ()) no_stream_data obj in
  function
  (* Operators without operands. *)
  | O.Op_S -> op "S" []
  | O.Op_s -> op "s" []
  | O.Op_f -> op "f" []
  | O.Op_F -> op "F" []
  | O.Op_f' -> op "f*" []
  | O.Op_B -> op "B" []
  | O.Op_B' -> op "B*" []
  | O.Op_b -> op "b" []
  | O.Op_b' -> op "b*" []
  | O.Op_n -> op "n" []
  | O.Op_W -> op "W" []
  | O.Op_W' -> op "W*" []
  | O.Op_BT -> op "BT" []
  | O.Op_ET -> op "ET" []
  | O.Op_q -> op "q" []
  | O.Op_Q -> op "Q" []
  | O.Op_h -> op "h" []
  | O.Op_T' -> op "T*" []
  | O.Op_EMC -> op "EMC" []
  | O.Op_BX -> op "BX" []
  | O.Op_EX -> op "EX" []
  (* Operators with operands. *)
  | O.Op_re (a, b, c, d) -> op "re" (reals [a; b; c; d])
  | O.Op_k (c, m, y, k) -> op "k" (reals [c; m; y; k])
  | O.Op_m (a, b) -> op "m" (reals [a; b])
  | O.Op_l (a, b) -> op "l" (reals [a; b])
  | O.Op_BDC (s, obj) -> op "BDC" [str s; str (Pdfwrite.string_of_pdf obj)]
  | O.Op_gs s -> op "gs" [str s]
  | O.Op_Do s -> op "Do" [str s]
  | O.Op_CS s -> op "CS" [str s]
  | O.Op_SCN fs -> op "SCN" (reals fs)
  | O.Op_j j -> op "j" [whole j]
  | O.Op_cm t -> matrix "cm" t
  | O.Op_d (fl, y) -> op "d" [J.Array (reals fl); real y]
  | O.Op_w w -> op "w" [real w]
  | O.Op_J j -> op "J" [whole j]
  | O.Op_M m -> op "M" [real m]
  | O.Op_ri s -> op "ri" [str s]
  | O.Op_i i -> op "i" [whole i]
  | O.Op_c (a, b, c, d, e, k) -> op "c" (reals [a; b; c; d; e; k])
  | O.Op_v (a, b, c, d) -> op "v" (reals [a; b; c; d])
  | O.Op_y (a, b, c, d) -> op "y" (reals [a; b; c; d])
  | O.Op_Tc c -> op "Tc" [real c]
  | O.Op_Tw w -> op "Tw" [real w]
  | O.Op_Tz z -> op "Tz" [real z]
  | O.Op_TL l -> op "TL" [real l]
  | O.Op_Tf (k, s) -> op "Tf" [str k; real s]
  | O.Op_Tr i -> op "Tr" [whole i]
  | O.Op_Ts k -> op "Ts" [real k]
  | O.Op_Td (k, k') -> op "Td" (reals [k; k'])
  | O.Op_TD (k, k') -> op "TD" (reals [k; k'])
  | O.Op_Tm t -> matrix "Tm" t
  | O.Op_Tj s -> op "Tj" [str s]
  | O.Op_TJ pdfobject -> op "TJ" [embedded pdfobject]
  | O.Op_' s -> op "'" [str s]
  | O.Op_'' (k, k', s) -> op "''" [real k; real k'; str s]
  | O.Op_d0 (k, k') -> op "d0" (reals [k; k'])
  | O.Op_d1 (a, b, c, d, e, k) -> op "d1" (reals [a; b; c; d; e; k])
  | O.Op_cs s -> op "cs" [str s]
  | O.Op_SC fs -> op "SC" (reals fs)
  | O.Op_sc fs -> op "sc" (reals fs)
  | O.Op_scn fs -> op "scn" (reals fs)
  | O.Op_G k -> op "G" [real k]
  | O.Op_g k -> op "g" [real k]
  | O.Op_RG (r, g, b) -> op "RG" (reals [r; g; b])
  | O.Op_rg (r, g, b) -> op "rg" (reals [r; g; b])
  | O.Op_K (c, m, y, k) -> op "K" (reals [c; m; y; k])
  | O.Op_sh s -> op "sh" [str s]
  | O.Op_MP s -> op "MP" [str s]
  | O.Op_BMC s -> op "BMC" [str s]
  | O.Op_Unknown _ -> op "Unknown" []
  | O.Op_SCNName (s, fs) -> op "SCNName" (reals fs @ [str s])
  | O.Op_scnName (s, fs) -> op "scnName" (reals fs @ [str s])
  | O.InlineImage (dict, data) ->
      J.Array [embedded dict; str (Pdfio.string_of_bytes data)]
  | O.Op_DP (s, obj) -> op "DP" [str s; embedded obj]
2020-01-30 15:10:30 +01:00
2020-01-31 13:17:55 +01:00
(* Parse the bytes of one content stream and return its operators as a JSON
   array, one element per operator.

   The parser is given [pdf] and [resources]; per the original note these are
   needed for lexing inline images (looking up the colourspace). Inherited
   resources are not considered for now: check the PDF standard. Stream data
   of any embedded object is always included (no_stream_data is false). *)
let parse_content_stream pdf resources bs =
  let operator_to_json = json_of_op pdf false in
  J.Array (map operator_to_json (O.parse_stream pdf resources [bs]))
2020-01-30 14:10:03 +01:00
2021-06-21 16:03:32 +02:00
(* Make sure every page has at most one content stream. Multiple streams
   are not necessarily split on operator boundaries, so each could fail to
   parse on its own; pages with two or more streams have them parsed together
   and re-emitted as a single stream. Pages with zero or one stream are left
   untouched.
   Future improvement: do not blow up shared content streams. *)
let precombine_page_content pdf =
  let merge_streams page =
    match page.Pdfpage.content with
    | _ :: _ :: _ as streams ->
        let ops = Pdfops.parse_operators pdf page.Pdfpage.resources streams in
        {page with Pdfpage.content = [Pdfops.stream_of_ops ops]}
    | [] | [_] -> page
  in
  let pages = Pdfpage.pages_of_pagetree pdf in
  Pdfpage.change_pages true pdf (map merge_streams pages)
2020-02-01 11:18:15 +01:00
(* Build the JSON representation of a whole PDF document.

   The result is a JSON array of [object number, object] pairs. Two pseudo
   objects come first: -1 holds the CPDFJSON parameters (format version,
   whether content was parsed, whether stream data is included, PDF major and
   minor version) and 0 holds the trailer dictionary.

   [parse_content]: when true, page content is first combined to one stream
   per page, and every object reported as a content stream by json_of_object
   is decoded and has its data replaced by the parsed operator list.
   [no_stream_data]: when true, stream data is elided (see json_of_object).

   Raises Failure if an object reported as a content stream could not be
   decoded or was not serialised as a stream. *)
let json_of_pdf parse_content no_stream_data pdf =
  let pdf = if parse_content then precombine_page_content pdf else pdf in
  Pdf.remove_unreferenced pdf;
  let trailerdict =
    (0, json_of_object pdf (fun _ -> ()) no_stream_data pdf.P.trailerdict)
  in
  let parameters =
    (-1,
     json_of_object pdf (fun _ -> ()) false
       (Pdf.Dictionary
          [("/CPDFJSONformatversion", Pdf.Integer 2);
           ("/CPDFJSONcontentparsed", Pdf.Boolean parse_content);
           ("/CPDFJSONstreamdataincluded", Pdf.Boolean (not no_stream_data));
           ("/CPDFJSONmajorpdfversion", Pdf.Integer pdf.Pdf.major);
           ("/CPDFJSONminorpdfversion", Pdf.Integer pdf.Pdf.minor);
          ]))
  in
  (* Object numbers reported as content streams while serialising. *)
  let content_streams = ref [] in
  let fcs n = content_streams := n::!content_streams in
  let pairs =
    let ps = ref [] in
    P.objiter
      (fun i pdfobj ->
         ps := (i, json_of_object pdf fcs no_stream_data pdfobj)::!ps)
      pdf;
    parameters::trailerdict::!ps
  in
  if parse_content then
    iter
      (fun n -> Pdfcodec.decode_pdfstream_until_unknown pdf (P.lookup_obj pdf n))
      !content_streams;
  (* Constant-time membership test: searching the list for every object would
     make this pass quadratic on documents with many content streams. *)
  let is_content_stream =
    let seen = Hashtbl.create 256 in
    iter (fun n -> Hashtbl.replace seen n ()) !content_streams;
    Hashtbl.mem seen
  in
  let pairs_parsed =
    if not parse_content then pairs else
      map
        (fun (objnum, obj) ->
           if is_content_stream objnum then
             begin match obj with
             | J.Object ["S", J.Array [dict; J.String _]] ->
                 (* FIXME Proper resources here for reasons explained above? *)
                 (* NOTE(review): [dict] was serialised before the stream was
                    decoded; confirm whether it should be refreshed. *)
                 let streamdata =
                   match P.lookup_obj pdf objnum with
                   | P.Stream {contents = (_, P.Got b)} -> b
                   | _ -> failwith "JSON: stream not decoded"
                 in
                 (objnum,
                  J.Object
                    ["S", J.Array [dict; parse_content_stream pdf (P.Dictionary []) streamdata]])
             | _ -> failwith "json_of_pdf: stream parsing inconsistency"
             end
           else
             (objnum, obj))
        pairs
  in
  J.Array
    (map
       (fun (objnum, jsonobj) -> J.Array [J.Number (soi objnum); jsonobj])
       pairs_parsed)
2020-01-30 11:42:24 +01:00
2021-10-01 13:53:21 +02:00
(* Rebuild a PDF document from the JSON produced by json_of_pdf.

   The top level must be an array of [object number, object] pairs. Object
   -1 is read as the CPDFJSON parameter dictionary, object 0 as the trailer
   dictionary, other negative numbers are ignored, and positive numbers
   become ordinary PDF objects.

   Raises Failure when the top level or a pair is malformed, when the
   parameters say stream data was not included, when the major or minor
   version is missing, or when the trailer dictionary has no indirect
   /Root entry. *)
let pdf_of_json json =
  let objs =
    match json with J.Array objs -> objs | _ -> failwith "bad json top level"
  in
  let params = ref Pdf.Null in
  let trailerdict = ref Pdf.Null in
  let objects =
    option_map
      (function
       | J.Array [J.Number n; o] ->
           let objnum = int_of_string n in
           begin match objnum with
           | -1 -> params := object_of_json o; None
           | 0 -> trailerdict := object_of_json o; None
           | n when n < 0 -> None
           | n -> Some (n, object_of_json o)
           end
       | _ -> failwith "json bad obj")
      objs
  in
  (* Look up one entry of the parameter dictionary. *)
  let param key = Pdf.lookup_direct (Pdf.empty ()) key !params in
  begin match param "/CPDFJSONstreamdataincluded" with
  | Some (Pdf.Boolean false) -> failwith "no stream data; cannot reconstruct PDF"
  | _ -> ()
  end;
  let major =
    match param "/CPDFJSONmajorpdfversion" with
    | Some (Pdf.Integer i) -> i
    | _ -> failwith "bad major version"
  in
  let minor =
    match param "/CPDFJSONminorpdfversion" with
    | Some (Pdf.Integer i) -> i
    | _ -> failwith "bad minor version"
  in
  let root =
    match !trailerdict with
    | Pdf.Dictionary d ->
        begin match lookup "/Root" d with
        | Some (Pdf.Indirect i) -> i
        | _ -> failwith "bad root"
        end
    | _ -> failwith "bad root 2"
  in
  let objmap = P.pdfobjmap_empty () in
  (* replace rather than add: a repeated object number in the input must not
     leave a second, shadowed binding in the table. *)
  List.iter (fun (k, v) -> Hashtbl.replace objmap k (ref (P.Parsed v), 0)) objects;
  (* Record the highest object number actually present, so that the field
     agrees with the table contents instead of claiming the table is empty. *)
  let maxobjnum =
    List.fold_left (fun m (k, _) -> if k > m then k else m) 0 objects
  in
  let objects =
    {P.maxobjnum = maxobjnum;
     P.parse = None;
     P.pdfobjects = objmap;
     P.object_stream_ids = Hashtbl.create 0}
  in
  {P.major;
   P.minor;
   P.root;
   P.objects;
   P.trailerdict = !trailerdict;
   P.was_linearized = false;
   P.saved_encryption = None}
(* Serialise [pdf] as JSON and write it to the output [o].
   [parse_content] and [no_stream_data] are passed to json_of_pdf unchanged.
   FIXME Proper streaming to output / from input, rather than making a big
   string first. *)
let to_output o parse_content no_stream_data pdf =
  let json = json_of_pdf parse_content no_stream_data pdf in
  let buf = Buffer.create 256 in
  let ppf = Format.formatter_of_buffer buf in
  J.format ppf json;
  (* Flush so that everything formatted so far reaches the buffer. *)
  Format.pp_print_flush ppf ();
  o.Pdfio.output_string (Buffer.contents buf)
2021-10-01 13:53:21 +02:00
(* Read the input [i] from position 0 for its reported length, parse the
   text as JSON and rebuild the PDF document it describes. *)
let of_input i =
  let len = i.Pdfio.in_channel_length in
  let raw = Pdfio.bytes_of_input i 0 len in
  let json = J.parse (Pdfio.string_of_bytes raw) in
  pdf_of_json json