From b671b637bd064032c634c54a8b7a075c7fbb51ca Mon Sep 17 00:00:00 2001 From: John Whitington Date: Wed, 6 Oct 2021 13:52:05 +0100 Subject: [PATCH] yojson writing --- cpdfjson.ml | 482 ++++++++++++++++++++++++++-------------------------- 1 file changed, 245 insertions(+), 237 deletions(-) diff --git a/cpdfjson.ml b/cpdfjson.ml index ba037af..8a4b78b 100644 --- a/cpdfjson.ml +++ b/cpdfjson.ml @@ -2,13 +2,8 @@ open Pdfutil open Cpdferror (*module J = Tjjson -module P = Pdf -module O = Pdfops -let sof = Printf.sprintf "%f" (* To prevent "0." *) -let soi = string_of_int -let string_of_float _ = error "use sof" -let string_of_int _ = error "use soi" + let opf = function | J.Object ["F", J.Number f] -> float_of_string f @@ -129,238 +124,8 @@ and object_of_json = function Pdfops.stream_of_ops (List.map op_of_json parsed_ops) | J.Object elts -> P.Dictionary (map (fun (n, o) -> (n, object_of_json o)) elts) -let rec json_of_object pdf fcs no_stream_data = function - | P.Null -> J.Null - | P.Boolean b -> J.Bool b - | P.Integer i -> J.Object [("I", J.Number (soi i))] - | P.Real r -> J.Object [("F", J.Number (sof r))] - | P.String s -> J.String s - | P.Name n -> J.Object [("N", J.String n)] - | P.Array objs -> J.Array (map (json_of_object pdf fcs no_stream_data) objs) - | P.Dictionary elts -> - iter - (function - ("/Contents", P.Indirect i) -> - begin match Pdf.lookup_obj pdf i with - | Pdf.Array is -> iter (function Pdf.Indirect i -> fcs i | _ -> ()) is - | _ -> fcs i - end - | ("/Contents", P.Array elts) -> iter (function P.Indirect i -> fcs i | _ -> ()) elts - | _ -> ()) - elts; - J.Object (map (fun (k, v) -> (k, json_of_object pdf fcs no_stream_data v)) elts) - | P.Stream ({contents = (P.Dictionary dict as d, stream)} as mut) as thestream -> - P.getstream thestream; - let str = - begin match P.lookup_direct pdf "/FunctionType" d with - | Some _ -> - Pdfcodec.decode_pdfstream_until_unknown pdf thestream; - begin match !mut with (_, P.Got b) -> Pdfio.string_of_bytes b | _ -> "failure: decomp" end - | None -> - if no_stream_data then "<>" else - match stream with P.Got b -> Pdfio.string_of_bytes b | P.ToGet _ -> "failure: toget" - end - in - json_of_object pdf fcs no_stream_data (P.Dictionary [("S", P.Array [P.Dictionary dict; P.String str])]) - | P.Stream _ -> J.String "error: stream with not-a-dictionary" - | P.Indirect i -> - begin match P.lookup_obj pdf i with - | P.Stream {contents = (P.Dictionary dict as d, _)} -> - begin match P.lookup_direct pdf "/Subtype" d with - | Some (P.Name "/Form") -> fcs i - | _ -> () - end - | _ -> () - end; - J.Number (soi i) -let json_of_op pdf no_stream_data = function - | O.Op_S -> J.Array [J.String "S"] - | O.Op_s -> J.Array [J.String "s"] - | O.Op_f -> J.Array [J.String "f"] - | O.Op_F -> J.Array [J.String "F"] - | O.Op_f' ->J.Array [J.String "f*"] - | O.Op_B -> J.Array [J.String "B"] - | O.Op_B' -> J.Array [J.String "B*"] - | O.Op_b -> J.Array [J.String "b"] - | O.Op_b' -> J.Array [J.String "b*"] - | O.Op_n -> J.Array [J.String "n"] - | O.Op_W -> J.Array [J.String "W"] - | O.Op_W' -> J.Array [J.String "W*"] - | O.Op_BT -> J.Array [J.String "BT"] - | O.Op_ET -> J.Array [J.String "ET"] - | O.Op_q -> J.Array [J.String "q"] - | O.Op_Q -> J.Array [J.String "Q"] - | O.Op_h -> J.Array [J.String "h"] - | O.Op_T' -> J.Array [J.String "T*"] - | O.Op_EMC -> J.Array [J.String "EMC"] - | O.Op_BX -> J.Array [J.String "BX"] - | O.Op_EX -> J.Array [J.String "EX"] - | O.Op_re (a, b, c, d) -> - J.Array [J.Number (sof a); J.Number (sof b); J.Number (sof c); J.Number (sof d); J.String "re"] - | O.Op_k (c, m, y, k) -> - J.Array [J.Number (sof c); J.Number (sof m); J.Number (sof y); J.Number (sof k); J.String "k"] - | O.Op_m (a, b) -> J.Array [J.Number (sof a); J.Number (sof b); J.String "m"] - | O.Op_l (a, b) -> J.Array [J.Number (sof a); J.Number (sof b); J.String "l"] - | O.Op_BDC (s, obj) -> J.Array [J.String s; json_of_object pdf (fun _ -> ()) no_stream_data obj; J.String "BDC"] - | O.Op_gs s -> J.Array [J.String s; J.String "gs"] - | O.Op_Do s -> J.Array [J.String s; J.String "Do"] - | O.Op_CS s -> J.Array [J.String s; J.String "CS"] - | O.Op_SCN fs -> J.Array ((map (fun x -> J.Number (sof x)) fs) @ [J.String "SCN"]) - | O.Op_j j -> J.Array [J.Number (soi j); J.String "j"] - | O.Op_cm t -> - J.Array - [J.Number (sof t.Pdftransform.a); J.Number (sof t.Pdftransform.b); J.Number (sof t.Pdftransform.c); - J.Number (sof t.Pdftransform.d); J.Number (sof t.Pdftransform.e); J.Number (sof t.Pdftransform.f); - J.String "cm"] - | O.Op_d (fl, y) -> - J.Array [J.Array (map (fun x -> J.Number (sof x)) fl); J.Number (sof y); J.String "d"] - | O.Op_w w -> J.Array [J.Number (sof w); J.String "w"] - | O.Op_J j -> J.Array [J.Number (soi j); J.String "J"] - | O.Op_M m -> J.Array [J.Number (sof m); J.String "M"] - | O.Op_ri s -> J.Array [J.String s; J.String "ri"] - | O.Op_i i -> J.Array [J.Number (soi i); J.String "i"] - | O.Op_c (a, b, c, d, e, k) -> - J.Array - [J.Number (sof a); J.Number (sof b); J.Number (sof c); - J.Number (sof d); J.Number (sof e); J.Number (sof k); J.String "c"] - | O.Op_v (a, b, c, d) -> - J.Array - [J.Number (sof a); J.Number (sof b); J.Number (sof c); - J.Number (sof d); J.String "v"] - | O.Op_y (a, b, c, d) -> - J.Array - [J.Number (sof a); J.Number (sof b); J.Number (sof c); - J.Number (sof d); J.String "y"] - | O.Op_Tc c -> J.Array [J.Number (sof c); J.String "Tc"] - | O.Op_Tw w -> J.Array [J.Number (sof w); J.String "Tw"] - | O.Op_Tz z -> J.Array [J.Number (sof z); J.String "Tz"] - | O.Op_TL l -> J.Array [J.Number (sof l); J.String "TL"] - | O.Op_Tf (k, s) -> J.Array [J.String k; J.Number (sof s); J.String "Tf"] - | O.Op_Tr i -> J.Array [J.Number (soi i); J.String "Tr"] - | O.Op_Ts k -> J.Array [J.Number (sof k); J.String "Ts"] - | O.Op_Td (k, k') -> J.Array [J.Number (sof k); J.Number (sof k'); J.String "Td"] - | O.Op_TD (k, k') -> J.Array [J.Number (sof k); J.Number (sof k'); J.String "TD"] - | O.Op_Tm t -> - J.Array - [J.Number (sof t.Pdftransform.a); J.Number (sof t.Pdftransform.b); J.Number (sof t.Pdftransform.c); - J.Number (sof t.Pdftransform.d); J.Number (sof t.Pdftransform.e); J.Number (sof t.Pdftransform.f); - J.String "Tm"] - | O.Op_Tj s -> J.Array [J.String s; J.String "Tj"] - | O.Op_TJ pdfobject -> J.Array [json_of_object pdf (fun _ -> ()) no_stream_data pdfobject; J.String "TJ"] - | O.Op_' s -> J.Array [J.String s; J.String "'"] - | O.Op_'' (k, k', s) -> J.Array [J.Number (sof k); J.Number (sof k'); J.String s; J.String "''"] - | O.Op_d0 (k, k') -> J.Array [J.Number (sof k); J.Number (sof k'); J.String "d0"] - | O.Op_d1 (a, b, c, d, e, k) -> - J.Array - [J.Number (sof a); J.Number (sof b); J.Number (sof c); - J.Number (sof d); J.Number (sof e); J.Number (sof k); J.String "d1"] - | O.Op_cs s -> J.Array [J.String s; J.String "cs"] - | O.Op_SC fs -> J.Array (map (fun x -> J.Number (sof x)) fs @ [J.String "SC"]) - | O.Op_sc fs -> J.Array (map (fun x -> J.Number (sof x)) fs @ [J.String "sc"]) - | O.Op_scn fs -> J.Array (map (fun x -> J.Number (sof x)) fs @ [J.String "scn"]) - | O.Op_G k -> J.Array [J.Number (sof k); J.String "G"] - | O.Op_g k -> J.Array [J.Number (sof k); J.String "g"] - | O.Op_RG (r, g, b) -> J.Array [J.Number (sof r); J.Number (sof g); J.Number (sof b); J.String "RG"] - | O.Op_rg (r, g, b) -> J.Array [J.Number (sof r); J.Number (sof g); J.Number (sof b); J.String "rg"] - | O.Op_K (c, m, y, k) -> J.Array [J.Number (sof c); J.Number (sof m); J.Number (sof y); J.Number (sof k); J.String "K"] - | O.Op_sh s -> J.Array [J.String s; J.String "sh"] - | O.Op_MP s -> J.Array [J.String s; J.String "MP"] - | O.Op_BMC s -> J.Array [J.String s; J.String "BMC"] - | O.Op_Unknown s -> J.Array [J.String s; J.String "Unknown"] - | O.Op_SCNName (s, fs) -> - J.Array (map (fun x -> J.Number (sof x)) fs @ [J.String s; J.String "SCNName"]) - | O.Op_scnName (s, fs) -> - J.Array (map (fun x -> J.Number (sof x)) fs @ [J.String s; J.String "scnName"]) - | O.InlineImage (dict, data) -> J.Array [json_of_object pdf (fun _ -> ()) no_stream_data dict; J.String (Pdfio.string_of_bytes data); J.String "InlineImage"] - | O.Op_DP (s, obj) -> J.Array [J.String s; json_of_object pdf (fun _ -> ()) no_stream_data obj; J.String "DP"] -(* parse_stream needs pdf and resources. These are for lexing of inline images, - * looking up the colourspace. We do not need to worry about inherited - * resources, though? For now, don't worry about inherited resources: check in - * PDF standard. *) - -let parse_content_stream pdf resources bs = - let ops = O.parse_stream pdf resources [bs] in - J.Array (map (json_of_op pdf false) ops) - -(* We need to make sure each page only has one page content stream. Otherwise, - if not split on op boundaries, each one would fail to parse on its own. *) -(* Future improvement. Don't blow up shared content streams. *) -let precombine_page_content pdf = - let pages' = - map - (fun page -> - match page.Pdfpage.content with - [] | [_] -> page - | _ -> - let operators = - Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content - in - {page with Pdfpage.content = [Pdfops.stream_of_ops operators]} - ) - (Pdfpage.pages_of_pagetree pdf) - in - Pdfpage.change_pages true pdf pages' - -let json_of_pdf parse_content no_stream_data decompress_streams pdf = - let pdf = if parse_content then precombine_page_content pdf else pdf in - if decompress_streams then - Pdf.objiter - (fun n obj -> - Printf.eprintf "obj %i\n" n; - match obj with - | Pdf.Stream _ -> Printf.eprintf "decompressing...\n"; Pdfcodec.decode_pdfstream_until_unknown pdf obj - | _ -> ()) - pdf; - Pdf.remove_unreferenced pdf; - let trailerdict = (0, json_of_object pdf (fun x -> ()) no_stream_data pdf.P.trailerdict) in - let parameters = - (-1, json_of_object pdf (fun x -> ()) false - (Pdf.Dictionary [("/CPDFJSONformatversion", Pdf.Integer 2); - ("/CPDFJSONcontentparsed", Pdf.Boolean parse_content); - ("/CPDFJSONstreamdataincluded", Pdf.Boolean (not no_stream_data)); - ("/CPDFJSONmajorpdfversion", Pdf.Integer pdf.Pdf.major); - ("/CPDFJSONminorpdfversion", Pdf.Integer pdf.Pdf.minor); - ])) - in - let content_streams = ref [] in - let fcs n = - content_streams := n::!content_streams; - if parse_content then Pdfcodec.decode_pdfstream_until_unknown pdf (P.lookup_obj pdf n) - in - let pairs = - let ps = ref [] in - P.objiter - (fun i pdfobj -> - ps := (i, json_of_object pdf fcs no_stream_data pdfobj)::!ps) - pdf; - parameters::trailerdict::!ps - in - let pairs_parsed = - if not parse_content then pairs else - map - (fun (objnum, obj) -> - if mem objnum !content_streams then - begin match obj with - | J.Object ["S", J.Array [dict; J.String _]] -> - (* FIXME Proper resources here for reasons explained above? *) - let streamdata = - match P.lookup_obj pdf objnum with - | P.Stream {contents = (_, P.Got b)} -> b - | _ -> error "JSON: stream not decoded" - in - (objnum, J.Object ["S", J.Array [dict; parse_content_stream pdf (P.Dictionary []) streamdata]]) - | _ -> error "json_of_pdf: stream parsing inconsistency" - end - else - (objnum, obj)) - pairs - in - J.Array - (map - (fun (objnum, jsonobj) -> J.Array [J.Number (soi objnum); jsonobj]) - pairs_parsed) let pdf_of_json json = (*flprint (J.show json); flprint "\n";*) @@ -427,7 +192,250 @@ let to_output o parse_content no_stream_data decompress_streams pdf = o.Pdfio.output_string (Buffer.contents b) *) -let to_output _ _ _ _ _ = () +module J = Cpdfyojson +module P = Pdf +module O = Pdfops + +let rec json_of_object pdf fcs no_stream_data = function + | P.Null -> `Null + | P.Boolean b -> `Bool b + | P.Integer i -> `Assoc [("I", `Int i)] + | P.Real r -> `Assoc [("F", `Float r)] + | P.String s -> `String s + | P.Name n -> `Assoc [("N", `String n)] + | P.Array objs -> `List (map (json_of_object pdf fcs no_stream_data) objs) + | P.Dictionary elts -> + iter + (function + ("/Contents", P.Indirect i) -> + begin match Pdf.lookup_obj pdf i with + | Pdf.Array is -> iter (function Pdf.Indirect i -> fcs i | _ -> ()) is + | _ -> fcs i + end + | ("/Contents", P.Array elts) -> iter (function P.Indirect i -> fcs i | _ -> ()) elts + | _ -> ()) + elts; + `Assoc (map (fun (k, v) -> (k, json_of_object pdf fcs no_stream_data v)) elts) + | P.Stream ({contents = (P.Dictionary dict as d, stream)} as mut) as thestream -> + P.getstream thestream; + let str = + begin match P.lookup_direct pdf "/FunctionType" d with + | Some _ -> + Pdfcodec.decode_pdfstream_until_unknown pdf thestream; + begin match !mut with (_, P.Got b) -> Pdfio.string_of_bytes b | _ -> "failure: decomp" end + | None -> + if no_stream_data then "<>" else + match stream with P.Got b -> Pdfio.string_of_bytes b | P.ToGet _ -> "failure: toget" + end + in + json_of_object pdf fcs no_stream_data (P.Dictionary [("S", P.Array [P.Dictionary dict; P.String str])]) + | P.Stream _ -> error "error: stream with not-a-dictionary" + | P.Indirect i -> + begin match P.lookup_obj pdf i with + | P.Stream {contents = (P.Dictionary dict as d, _)} -> + begin match P.lookup_direct pdf "/Subtype" d with + | Some (P.Name "/Form") -> fcs i + | _ -> () + end + | _ -> () + end; + `Int i + +let json_of_op pdf no_stream_data = function + | O.Op_S -> `List [`String "S"] + | O.Op_s -> `List [`String "s"] + | O.Op_f -> `List [`String "f"] + | O.Op_F -> `List [`String "F"] + | O.Op_f' ->`List [`String "f*"] + | O.Op_B -> `List [`String "B"] + | O.Op_B' -> `List [`String "B*"] + | O.Op_b -> `List [`String "b"] + | O.Op_b' -> `List [`String "b*"] + | O.Op_n -> `List [`String "n"] + | O.Op_W -> `List [`String "W"] + | O.Op_W' -> `List [`String "W*"] + | O.Op_BT -> `List [`String "BT"] + | O.Op_ET -> `List [`String "ET"] + | O.Op_q -> `List [`String "q"] + | O.Op_Q -> `List [`String "Q"] + | O.Op_h -> `List [`String "h"] + | O.Op_T' -> `List [`String "T*"] + | O.Op_EMC -> `List [`String "EMC"] + | O.Op_BX -> `List [`String "BX"] + | O.Op_EX -> `List [`String "EX"] + | O.Op_re (a, b, c, d) -> + `List [`Float a; `Float b; `Float c; `Float d; `String "re"] + | O.Op_k (c, m, y, k) -> + `List [`Float c; `Float m; `Float y; `Float k; `String "k"] + | O.Op_m (a, b) -> `List [`Float a; `Float b; `String "m"] + | O.Op_l (a, b) -> `List [`Float a; `Float b; `String "l"] + | O.Op_BDC (s, obj) -> `List [`String s; json_of_object pdf (fun _ -> ()) no_stream_data obj; `String "BDC"] + | O.Op_gs s -> `List [`String s; `String "gs"] + | O.Op_Do s -> `List [`String s; `String "Do"] + | O.Op_CS s -> `List [`String s; `String "CS"] + | O.Op_SCN fs -> `List ((map (fun x -> `Float x) fs) @ [`String "SCN"]) + | O.Op_j j -> `List [`Int j; `String "j"] + | O.Op_cm t -> + `List + [`Float t.Pdftransform.a; `Float t.Pdftransform.b; `Float t.Pdftransform.c; + `Float t.Pdftransform.d; `Float t.Pdftransform.e; `Float t.Pdftransform.f; + `String "cm"] + | O.Op_d (fl, y) -> + `List [`List (map (fun x -> `Float x) fl); `Float y; `String "d"] + | O.Op_w w -> `List [`Float w; `String "w"] + | O.Op_J j -> `List [`Int j; `String "J"] + | O.Op_M m -> `List [`Float m; `String "M"] + | O.Op_ri s -> `List [`String s; `String "ri"] + | O.Op_i i -> `List [`Int i; `String "i"] + | O.Op_c (a, b, c, d, e, k) -> + `List + [`Float a; `Float b; `Float c; + `Float d; `Float e; `Float k; `String "c"] + | O.Op_v (a, b, c, d) -> + `List + [`Float a; `Float b; `Float c; + `Float d; `String "v"] + | O.Op_y (a, b, c, d) -> + `List + [`Float a; `Float b; `Float c; + `Float d; `String "y"] + | O.Op_Tc c -> `List [`Float c; `String "Tc"] + | O.Op_Tw w -> `List [`Float w; `String "Tw"] + | O.Op_Tz z -> `List [`Float z; `String "Tz"] + | O.Op_TL l -> `List [`Float l; `String "TL"] + | O.Op_Tf (k, s) -> `List [`String k; `Float s; `String "Tf"] + | O.Op_Tr i -> `List [`Int i; `String "Tr"] + | O.Op_Ts k -> `List [`Float k; `String "Ts"] + | O.Op_Td (k, k') -> `List [`Float k; `Float k'; `String "Td"] + | O.Op_TD (k, k') -> `List [`Float k; `Float k'; `String "TD"] + | O.Op_Tm t -> + `List + [`Float t.Pdftransform.a; `Float t.Pdftransform.b; `Float t.Pdftransform.c; + `Float t.Pdftransform.d; `Float t.Pdftransform.e; `Float t.Pdftransform.f; + `String "Tm"] + | O.Op_Tj s -> `List [`String s; `String "Tj"] + | O.Op_TJ pdfobject -> `List [json_of_object pdf (fun _ -> ()) no_stream_data pdfobject; `String "TJ"] + | O.Op_' s -> `List [`String s; `String "'"] + | O.Op_'' (k, k', s) -> `List [`Float k; `Float k'; `String s; `String "''"] + | O.Op_d0 (k, k') -> `List [`Float k; `Float k'; `String "d0"] + | O.Op_d1 (a, b, c, d, e, k) -> + `List + [`Float a; `Float b; `Float c; + `Float d; `Float e; `Float k; `String "d1"] + | O.Op_cs s -> `List [`String s; `String "cs"] + | O.Op_SC fs -> `List (map (fun x -> `Float x) fs @ [`String "SC"]) + | O.Op_sc fs -> `List (map (fun x -> `Float x) fs @ [`String "sc"]) + | O.Op_scn fs -> `List (map (fun x -> `Float x) fs @ [`String "scn"]) + | O.Op_G k -> `List [`Float k; `String "G"] + | O.Op_g k -> `List [`Float k; `String "g"] + | O.Op_RG (r, g, b) -> `List [`Float r; `Float g; `Float b; `String "RG"] + | O.Op_rg (r, g, b) -> `List [`Float r; `Float g; `Float b; `String "rg"] + | O.Op_K (c, m, y, k) -> `List [`Float c; `Float m; `Float y; `Float k; `String "K"] + | O.Op_sh s -> `List [`String s; `String "sh"] + | O.Op_MP s -> `List [`String s; `String "MP"] + | O.Op_BMC s -> `List [`String s; `String "BMC"] + | O.Op_Unknown s -> `List [`String s; `String "Unknown"] + | O.Op_SCNName (s, fs) -> + `List (map (fun x -> `Float x) fs @ [`String s; `String "SCNName"]) + | O.Op_scnName (s, fs) -> + `List (map (fun x -> `Float x) fs @ [`String s; `String "scnName"]) + | O.InlineImage (dict, data) -> + `List [json_of_object pdf (fun _ -> ()) no_stream_data dict; `String (Pdfio.string_of_bytes data); `String "InlineImage"] + | O.Op_DP (s, obj) -> + `List [`String s; json_of_object pdf (fun _ -> ()) no_stream_data obj; `String "DP"] + + +(* parse_stream needs pdf and resources. These are for lexing of inline images, + * looking up the colourspace. We do not need to worry about inherited + * resources, though? For now, don't worry about inherited resources: check in + * PDF standard. *) + +let parse_content_stream pdf resources bs = + let ops = O.parse_stream pdf resources [bs] in + `List (map (json_of_op pdf false) ops) + +(* We need to make sure each page only has one page content stream. Otherwise, + if not split on op boundaries, each one would fail to parse on its own. *) +(* Future improvement. Don't blow up shared content streams. *) +let precombine_page_content pdf = + let pages' = + map + (fun page -> + match page.Pdfpage.content with + [] | [_] -> page + | _ -> + let operators = + Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content + in + {page with Pdfpage.content = [Pdfops.stream_of_ops operators]} + ) + (Pdfpage.pages_of_pagetree pdf) + in + Pdfpage.change_pages true pdf pages' + +let json_of_pdf parse_content no_stream_data decompress_streams pdf = + let pdf = if parse_content then precombine_page_content pdf else pdf in + if decompress_streams then + Pdf.objiter + (fun n obj -> + Printf.eprintf "obj %i\n" n; + match obj with + | Pdf.Stream _ -> Printf.eprintf "decompressing...\n"; Pdfcodec.decode_pdfstream_until_unknown pdf obj + | _ -> ()) + pdf; + Pdf.remove_unreferenced pdf; + let trailerdict = (0, json_of_object pdf (fun x -> ()) no_stream_data pdf.P.trailerdict) in + let parameters = + (-1, json_of_object pdf (fun x -> ()) false + (Pdf.Dictionary [("/CPDFJSONformatversion", Pdf.Integer 2); + ("/CPDFJSONcontentparsed", Pdf.Boolean parse_content); + ("/CPDFJSONstreamdataincluded", Pdf.Boolean (not no_stream_data)); + ("/CPDFJSONmajorpdfversion", Pdf.Integer pdf.Pdf.major); + ("/CPDFJSONminorpdfversion", Pdf.Integer pdf.Pdf.minor); + ])) + in + let content_streams = ref [] in + let fcs n = + content_streams := n::!content_streams; + if parse_content then Pdfcodec.decode_pdfstream_until_unknown pdf (P.lookup_obj pdf n) + in + let pairs = + let ps = ref [] in + P.objiter + (fun i pdfobj -> + ps := (i, json_of_object pdf fcs no_stream_data pdfobj)::!ps) + pdf; + parameters::trailerdict::!ps + in + let pairs_parsed = + if not parse_content then pairs else + map + (fun (objnum, obj) -> + if mem objnum !content_streams then + begin match obj with + | `Assoc ["S", `List [dict; `String _]] -> + (* FIXME Proper resources here for reasons explained above? *) + let streamdata = + match P.lookup_obj pdf objnum with + | P.Stream {contents = (_, P.Got b)} -> b + | _ -> error "JSON: stream not decoded" + in + (objnum, `Assoc ["S", `List [dict; parse_content_stream pdf (P.Dictionary []) streamdata]]) + | _ -> error "json_of_pdf: stream parsing inconsistency" + end + else + (objnum, obj)) + pairs + in + `List + (map + (fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj]) + pairs_parsed) + +(* FIXME Proper streaming to output *) +let to_output o parse_content no_stream_data decompress_streams pdf = + let json = json_of_pdf parse_content no_stream_data decompress_streams pdf in + o.Pdfio.output_string (J.pretty_to_string json) let example_pdf = let page =