This commit is contained in:
John Whitington 2023-01-16 15:03:34 +08:00
parent d5c0af5ebf
commit 0bd9853d99
6 changed files with 54 additions and 47 deletions

View File

@ -74,7 +74,7 @@ let annotations_json_page pdf page pagenum =
let annot = Pdf.direct pdf annot in let annot = Pdf.direct pdf annot in
let annot = rewrite_destinations pdf annot in let annot = rewrite_destinations pdf annot in
extra := annot::!extra; extra := annot::!extra;
`List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false annot]) `List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~utf8:true ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false annot])
annots annots
| _ -> [] | _ -> []
@ -125,9 +125,9 @@ let postprocess_json pdf objnum_to_serial_map json =
map map
(function (function
| `List [`Int pagenum; `Int serial; jo] -> | `List [`Int pagenum; `Int serial; jo] ->
let pdfobj = Cpdfjson.object_of_json jo in let pdfobj = Cpdfjson.object_of_json ~utf8:true jo in
let fixed = postprocess_json_pdf objnum_to_serial_map pdf pdfobj in let fixed = postprocess_json_pdf objnum_to_serial_map pdf pdfobj in
`List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false fixed] `List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~utf8:true ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false fixed]
| _ -> assert false) | _ -> assert false)
json json
@ -149,13 +149,13 @@ let list_annotations_json range pdf =
let extra = let extra =
map map
(fun n -> (fun n ->
`List [`Int ~-n; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false (Pdf.lookup_obj pdf n)]) `List [`Int ~-n; Cpdfjson.json_of_object ~utf8:true ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false (Pdf.lookup_obj pdf n)])
(setify (flatten (map (Pdf.objects_referenced [] [] pdf) extra))) (setify (flatten (map (Pdf.objects_referenced [] [] pdf) extra)))
in in
let header = let header =
`List `List
[`Int 0; [`Int 0;
Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false Cpdfjson.json_of_object ~utf8:true ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false
(Pdf.Dictionary ["/CPDFJSONannotformatversion", Pdf.Integer 1])] (Pdf.Dictionary ["/CPDFJSONannotformatversion", Pdf.Integer 1])]
in in
let json = `List ([header] @ json @ extra) in let json = `List ([header] @ json @ extra) in

View File

@ -78,7 +78,7 @@ let bookmark_of_data pdf i s i' isopen optionaldest =
Pdfmarks.isopen = isopen} Pdfmarks.isopen = isopen}
let target_of_json_target pdf pagenumber target = let target_of_json_target pdf pagenumber target =
target_of_markfile_obj pdf pagenumber (Cpdfjson.object_of_json target) target_of_markfile_obj pdf pagenumber (Cpdfjson.object_of_json ~utf8:false target)
let mark_of_json pdf = function let mark_of_json pdf = function
| `Assoc [("level", `Int level); | `Assoc [("level", `Int level);
@ -176,8 +176,8 @@ let json_of_target pdf fastrefnums x =
let a = let a =
Pdf.Array (Pdf.Integer (Pdfpage.pagenumber_of_target ~fastrefnums pdf x)::more) Pdf.Array (Pdf.Integer (Pdfpage.pagenumber_of_target ~fastrefnums pdf x)::more)
in in
Cpdfjson.json_of_object pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false a Cpdfjson.json_of_object pdf (fun _ -> ()) ~utf8:false ~no_stream_data:false ~parse_content:false a
| x -> Cpdfjson.json_of_object pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false x | x -> Cpdfjson.json_of_object pdf (fun _ -> ()) ~utf8:false ~no_stream_data:false ~parse_content:false x
let output_json_marks output calculate_page_number pdf fastrefnums marks = let output_json_marks output calculate_page_number pdf fastrefnums marks =
let module J = Cpdfyojson.Safe in let module J = Cpdfyojson.Safe in

View File

@ -1705,14 +1705,14 @@ let setprintdictentry s =
let setreplacedictentryvalue s = let setreplacedictentryvalue s =
try try
let pdfobj = Cpdfjson.object_of_json (Cpdfyojson.Safe.from_string s) in let pdfobj = Cpdfjson.object_of_json ~utf8:false (Cpdfyojson.Safe.from_string s) in
args.replace_dict_entry_value <- pdfobj args.replace_dict_entry_value <- pdfobj
with with
e -> error (Printf.sprintf "Failed to parse replacement value: %s\n" (Printexc.to_string e)) e -> error (Printf.sprintf "Failed to parse replacement value: %s\n" (Printexc.to_string e))
let setdictentrysearch s = let setdictentrysearch s =
try try
let pdfobj = Cpdfjson.object_of_json (Cpdfyojson.Safe.from_string s) in let pdfobj = Cpdfjson.object_of_json ~utf8:false (Cpdfyojson.Safe.from_string s) in
args.dict_entry_search <- Some pdfobj args.dict_entry_search <- Some pdfobj
with with
e -> error (Printf.sprintf "Failed to parse search term: %s\n" (Printexc.to_string e)) e -> error (Printf.sprintf "Failed to parse search term: %s\n" (Printexc.to_string e))
@ -3332,6 +3332,7 @@ let write_json output pdf =
| Stdout -> | Stdout ->
Cpdfjson.to_output Cpdfjson.to_output
(Pdfio.output_of_channel stdout) (Pdfio.output_of_channel stdout)
~utf8:false
~parse_content:args.jsonparsecontentstreams ~parse_content:args.jsonparsecontentstreams
~no_stream_data:args.jsonnostreamdata ~no_stream_data:args.jsonnostreamdata
~decompress_streams:args.jsondecompressstreams ~decompress_streams:args.jsondecompressstreams
@ -3341,6 +3342,7 @@ let write_json output pdf =
let f = open_out filename in let f = open_out filename in
Cpdfjson.to_output Cpdfjson.to_output
(Pdfio.output_of_channel f) (Pdfio.output_of_channel f)
~utf8:false
~parse_content:args.jsonparsecontentstreams ~parse_content:args.jsonparsecontentstreams
~no_stream_data:args.jsonnostreamdata ~no_stream_data:args.jsonnostreamdata
~decompress_streams:args.jsondecompressstreams ~decompress_streams:args.jsondecompressstreams

View File

@ -102,7 +102,7 @@ let opi = function
| `Assoc ["I", `Float f] -> int_of_float f | `Assoc ["I", `Float f] -> int_of_float f
| _ -> error "num: not an integer" | _ -> error "num: not an integer"
let rec op_of_json = function let rec op_of_json utf8 = function
| `List [`String "S"] -> O.Op_S | `List [`String "S"] -> O.Op_S
| `List [`String "s"] -> O.Op_s | `List [`String "s"] -> O.Op_s
| `List [`String "f"] -> O.Op_f | `List [`String "f"] -> O.Op_f
@ -128,7 +128,7 @@ let rec op_of_json = function
| `List [a; b; c; d; `String "k"] -> O.Op_k (opf a, opf b, opf c, opf d) | `List [a; b; c; d; `String "k"] -> O.Op_k (opf a, opf b, opf c, opf d)
| `List [a; b; `String "m"] -> O.Op_m (opf a, opf b) | `List [a; b; `String "m"] -> O.Op_m (opf a, opf b)
| `List [a; b; `String "l"] -> O.Op_l (opf a, opf b) | `List [a; b; `String "l"] -> O.Op_l (opf a, opf b)
| `List [`String s; obj; `String "BDC"] -> O.Op_BDC (s, object_of_json obj) | `List [`String s; obj; `String "BDC"] -> O.Op_BDC (s, object_of_json ~utf8 obj)
| `List [`String s; `String "gs"] -> O.Op_gs s | `List [`String s; `String "gs"] -> O.Op_gs s
| `List [`String s; `String "Do"] -> O.Op_Do s | `List [`String s; `String "Do"] -> O.Op_Do s
| `List [`String s; `String "CS"] -> O.Op_CS s | `List [`String s; `String "CS"] -> O.Op_CS s
@ -160,7 +160,7 @@ let rec op_of_json = function
{Pdftransform.a = opf a; Pdftransform.b = opf b; Pdftransform.c = opf c; {Pdftransform.a = opf a; Pdftransform.b = opf b; Pdftransform.c = opf c;
Pdftransform.d = opf d; Pdftransform.e = opf e; Pdftransform.f = opf f} Pdftransform.d = opf d; Pdftransform.e = opf e; Pdftransform.f = opf f}
| `List [`String s; `String "Tj"] -> Op_Tj s | `List [`String s; `String "Tj"] -> Op_Tj s
| `List [obj; `String "TJ"] -> Op_TJ (object_of_json obj) | `List [obj; `String "TJ"] -> Op_TJ (object_of_json ~utf8 obj)
| `List [`String s; `String "'"] -> Op_' s | `List [`String s; `String "'"] -> Op_' s
| `List [a; b; `String s; `String "''"] -> Op_'' (opf a, opf b, s) | `List [a; b; `String s; `String "''"] -> Op_'' (opf a, opf b, s)
| `List [a; b; `String "d0"] -> Op_d0 (opf a, opf b) | `List [a; b; `String "d0"] -> Op_d0 (opf a, opf b)
@ -175,9 +175,9 @@ let rec op_of_json = function
| `List [`String s; `String "MP"] -> Op_MP s; | `List [`String s; `String "MP"] -> Op_MP s;
| `List [`String s; `String "BMC"] -> Op_BMC s; | `List [`String s; `String "BMC"] -> Op_BMC s;
| `List [`String s; `String "Unknown"] -> O.Op_Unknown s | `List [`String s; `String "Unknown"] -> O.Op_Unknown s
| `List [`String s; obj; `String "DP"] -> O.Op_DP (s, object_of_json obj) | `List [`String s; obj; `String "DP"] -> O.Op_DP (s, object_of_json ~utf8 obj)
| `List [a; `String b; `String "InlineImage"] -> | `List [a; `String b; `String "InlineImage"] ->
O.InlineImage (object_of_json a, Pdfio.bytes_of_string b) O.InlineImage (object_of_json ~utf8 a, Pdfio.bytes_of_string b)
| `List torev -> | `List torev ->
begin match rev torev with begin match rev torev with
| `String "SCN"::ns -> O.Op_SCN (map opf (rev ns)) | `String "SCN"::ns -> O.Op_SCN (map opf (rev ns))
@ -194,47 +194,52 @@ let rec op_of_json = function
Printf.eprintf "Unable to read op from %s\n" (J.show j); Printf.eprintf "Unable to read op from %s\n" (J.show j);
error "op reading failed" error "op reading failed"
and object_of_json ?(utf8=false) = function and object_of_json ~utf8 = function
| `Null -> P.Null | `Null -> P.Null
| `Bool b -> P.Boolean b | `Bool b -> P.Boolean b
| `Int n -> Pdf.Indirect n | `Int n -> Pdf.Indirect n
| `String s -> P.String s | `String s -> P.String s
| `List objs -> P.Array (map object_of_json objs) | `List objs -> P.Array (map (object_of_json ~utf8) objs)
| `Assoc ["I", `Int i] -> P.Integer i | `Assoc ["I", `Int i] -> P.Integer i
| `Assoc ["F", `Float f] -> P.Real f | `Assoc ["F", `Float f] -> P.Real f
| `Assoc ["N", `String n] -> P.Name n | `Assoc ["N", `String n] -> P.Name n
| `Assoc ["S", `List [dict; `String data]] -> | `Assoc ["S", `List [dict; `String data]] ->
let d' = let d' =
P.add_dict_entry (object_of_json dict) "/Length" (P.Integer (String.length data)) P.add_dict_entry (object_of_json ~utf8 dict) "/Length" (P.Integer (String.length data))
in in
P.Stream (ref (d', P.Got (Pdfio.bytes_of_string data))) P.Stream (ref (d', P.Got (Pdfio.bytes_of_string data)))
| `Assoc ["S", `List [dict; `List parsed_ops]] -> | `Assoc ["S", `List [dict; `List parsed_ops]] ->
begin match begin match
Pdfops.stream_of_ops (List.map op_of_json parsed_ops) Pdfops.stream_of_ops (List.map (op_of_json utf8) parsed_ops)
with with
| P.Stream {contents = (_, Pdf.Got data)} -> | P.Stream {contents = (_, Pdf.Got data)} ->
let d' = let d' =
P.add_dict_entry (object_of_json dict) "/Length" (P.Integer (Pdfio.bytes_size data)) P.add_dict_entry (object_of_json ~utf8 dict) "/Length" (P.Integer (Pdfio.bytes_size data))
in in
P.Stream (ref (d', Pdf.Got data)) P.Stream (ref (d', Pdf.Got data))
| _ -> assert false | _ -> assert false
end end
| `Assoc elts -> P.Dictionary (map (fun (n, o) -> (n, object_of_json o)) elts) | `Assoc elts -> P.Dictionary (map (fun (n, o) -> (n, object_of_json ~utf8 o)) elts)
| _ -> error "not recognised in object_of_json" | _ -> error "not recognised in object_of_json"
let pdf_of_json json = let pdf_of_json json =
let objs = match json with `List objs -> objs | _ -> error "bad json top level" in let objs = match json with `List objs -> objs | _ -> error "bad json top level" in
let params = ref Pdf.Null in let params = ref Pdf.Null in
let utf8 = ref false in
let read_utf8 () =
match Pdf.lookup_direct (Pdf.empty ()) "/CPDFJSONisUTF8" !params with
Some (Pdf.Boolean b) -> utf8 := b | _ -> ()
in
let trailerdict = ref Pdf.Null in let trailerdict = ref Pdf.Null in
let objects = let objects =
option_map option_map
(function (function
| `List [`Int objnum; o] -> | `List [`Int objnum; o] ->
begin match objnum with begin match objnum with
| -1 -> params := object_of_json o; None | -1 -> params := object_of_json ~utf8:false o; read_utf8 (); None
| 0 -> trailerdict := object_of_json o; None | 0 -> trailerdict := object_of_json ~utf8:!utf8 o; None
| n when n < 0 -> None | n when n < 0 -> None
| n -> Some (n, object_of_json o) | n -> Some (n, object_of_json ~utf8:!utf8 o)
end end
| _ -> error "json bad obj") | _ -> error "json bad obj")
objs objs
@ -278,14 +283,14 @@ let mkfloat f = `Assoc [("F", `Float f)]
let mkint i = `Assoc [("I", `Int i)] let mkint i = `Assoc [("I", `Int i)]
let mkname n = `Assoc [("N", `String n)] let mkname n = `Assoc [("N", `String n)]
let rec json_of_object ?(utf8=false) ?(clean_strings=false) pdf fcs ~no_stream_data ~parse_content = function let rec json_of_object ~utf8 ?(clean_strings=false) pdf fcs ~no_stream_data ~parse_content = function
| P.Null -> `Null | P.Null -> `Null
| P.Boolean b -> `Bool b | P.Boolean b -> `Bool b
| P.Integer i -> mkint i | P.Integer i -> mkint i
| P.Real r -> mkfloat r | P.Real r -> mkfloat r
| P.String s -> `String (if clean_strings then Pdftext.simplify_utf16be s else s) | P.String s -> `String (if clean_strings then Pdftext.simplify_utf16be s else s)
| P.Name n -> mkname n | P.Name n -> mkname n
| P.Array objs -> `List (map (json_of_object pdf fcs ~no_stream_data ~parse_content) objs) | P.Array objs -> `List (map (json_of_object ~utf8 pdf fcs ~no_stream_data ~parse_content) objs)
| P.Dictionary elts -> | P.Dictionary elts ->
iter iter
(function (function
@ -297,7 +302,7 @@ let rec json_of_object ?(utf8=false) ?(clean_strings=false) pdf fcs ~no_stream_d
| ("/Contents", P.Array elts) -> iter (function P.Indirect i -> fcs i | _ -> ()) elts | ("/Contents", P.Array elts) -> iter (function P.Indirect i -> fcs i | _ -> ()) elts
| _ -> ()) | _ -> ())
elts; elts;
`Assoc (map (fun (k, v) -> (k, json_of_object pdf fcs ~no_stream_data ~parse_content v)) elts) `Assoc (map (fun (k, v) -> (k, json_of_object ~utf8 pdf fcs ~no_stream_data ~parse_content v)) elts)
| P.Stream ({contents = (P.Dictionary dict as d, stream)} as mut) as thestream -> | P.Stream ({contents = (P.Dictionary dict as d, stream)} as mut) as thestream ->
P.getstream thestream; P.getstream thestream;
let str, dict' = let str, dict' =
@ -310,7 +315,7 @@ let rec json_of_object ?(utf8=false) ?(clean_strings=false) pdf fcs ~no_stream_d
if no_stream_data then ("<<stream data elided>>", d) else if no_stream_data then ("<<stream data elided>>", d) else
match !mut with (_, P.Got b) -> (Pdfio.string_of_bytes b, d) | _ -> error "failure: toget" match !mut with (_, P.Got b) -> (Pdfio.string_of_bytes b, d) | _ -> error "failure: toget"
in in
json_of_object pdf fcs ~no_stream_data ~parse_content (P.Dictionary [("S", P.Array [dict'; P.String str])]) json_of_object ~utf8 pdf fcs ~no_stream_data ~parse_content (P.Dictionary [("S", P.Array [dict'; P.String str])])
| P.Stream _ -> error "error: stream with not-a-dictionary" | P.Stream _ -> error "error: stream with not-a-dictionary"
| P.Indirect i -> | P.Indirect i ->
begin match P.lookup_obj pdf i with begin match P.lookup_obj pdf i with
@ -327,7 +332,7 @@ let rec json_of_object ?(utf8=false) ?(clean_strings=false) pdf fcs ~no_stream_d
end; end;
`Int i `Int i
let json_of_op pdf no_stream_data = function let json_of_op utf8 pdf no_stream_data = function
| O.Op_S -> `List [`String "S"] | O.Op_S -> `List [`String "S"]
| O.Op_s -> `List [`String "s"] | O.Op_s -> `List [`String "s"]
| O.Op_f -> `List [`String "f"] | O.Op_f -> `List [`String "f"]
@ -355,7 +360,7 @@ let json_of_op pdf no_stream_data = function
`List [mkfloat c; mkfloat m; mkfloat y; mkfloat k; `String "k"] `List [mkfloat c; mkfloat m; mkfloat y; mkfloat k; `String "k"]
| O.Op_m (a, b) -> `List [mkfloat a; mkfloat b; `String "m"] | O.Op_m (a, b) -> `List [mkfloat a; mkfloat b; `String "m"]
| O.Op_l (a, b) -> `List [mkfloat a; mkfloat b; `String "l"] | O.Op_l (a, b) -> `List [mkfloat a; mkfloat b; `String "l"]
| O.Op_BDC (s, obj) -> `List [`String s; json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false obj; `String "BDC"] | O.Op_BDC (s, obj) -> `List [`String s; json_of_object ~utf8 pdf (fun _ -> ()) ~no_stream_data ~parse_content:false obj; `String "BDC"]
| O.Op_gs s -> `List [`String s; `String "gs"] | O.Op_gs s -> `List [`String s; `String "gs"]
| O.Op_Do s -> `List [`String s; `String "Do"] | O.Op_Do s -> `List [`String s; `String "Do"]
| O.Op_CS s -> `List [`String s; `String "CS"] | O.Op_CS s -> `List [`String s; `String "CS"]
@ -400,7 +405,7 @@ let json_of_op pdf no_stream_data = function
mkfloat t.Pdftransform.d; mkfloat t.Pdftransform.e; mkfloat t.Pdftransform.f; mkfloat t.Pdftransform.d; mkfloat t.Pdftransform.e; mkfloat t.Pdftransform.f;
`String "Tm"] `String "Tm"]
| O.Op_Tj s -> `List [`String s; `String "Tj"] | O.Op_Tj s -> `List [`String s; `String "Tj"]
| O.Op_TJ pdfobject -> `List [json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false pdfobject; `String "TJ"] | O.Op_TJ pdfobject -> `List [json_of_object ~utf8 pdf (fun _ -> ()) ~no_stream_data ~parse_content:false pdfobject; `String "TJ"]
| O.Op_' s -> `List [`String s; `String "'"] | O.Op_' s -> `List [`String s; `String "'"]
| O.Op_'' (k, k', s) -> `List [mkfloat k; mkfloat k'; `String s; `String "''"] | O.Op_'' (k, k', s) -> `List [mkfloat k; mkfloat k'; `String s; `String "''"]
| O.Op_d0 (k, k') -> `List [mkfloat k; mkfloat k'; `String "d0"] | O.Op_d0 (k, k') -> `List [mkfloat k; mkfloat k'; `String "d0"]
@ -426,15 +431,15 @@ let json_of_op pdf no_stream_data = function
| O.Op_scnName (s, fs) -> | O.Op_scnName (s, fs) ->
`List (map (fun x -> mkfloat x) fs @ [`String s; `String "scnName"]) `List (map (fun x -> mkfloat x) fs @ [`String s; `String "scnName"])
| O.InlineImage (dict, data) -> | O.InlineImage (dict, data) ->
`List [json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false dict; `String (Pdfio.string_of_bytes data); `String "InlineImage"] `List [json_of_object ~utf8 pdf (fun _ -> ()) ~no_stream_data ~parse_content:false dict; `String (Pdfio.string_of_bytes data); `String "InlineImage"]
| O.Op_DP (s, obj) -> | O.Op_DP (s, obj) ->
`List [`String s; json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false obj; `String "DP"] `List [`String s; json_of_object ~utf8 pdf (fun _ -> ()) ~no_stream_data ~parse_content:false obj; `String "DP"]
(* parse_stream needs pdf and resources. These are for lexing of inline images, (* parse_stream needs pdf and resources. These are for lexing of inline images,
* looking up the colourspace. *) * looking up the colourspace. *)
let parse_content_stream pdf resources bs = let parse_content_stream utf8 pdf resources bs =
let ops = O.parse_stream pdf resources [bs] in let ops = O.parse_stream pdf resources [bs] in
`List (map (json_of_op pdf false) ops) `List (map (json_of_op utf8 pdf false) ops)
(* Make sure each page only has one page content stream. Otherwise, (* Make sure each page only has one page content stream. Otherwise,
if not split on op boundaries, each one would fail to parse on its own. *) if not split on op boundaries, each one would fail to parse on its own. *)
@ -470,7 +475,7 @@ let preprocess_strings pdf =
Pdf.objselfmap (ppstring_single_object pdf) pdf Pdf.objselfmap (ppstring_single_object pdf) pdf
let json_of_pdf let json_of_pdf
~parse_content ~no_stream_data ~decompress_streams ~clean_strings ~utf8 ~parse_content ~no_stream_data ~decompress_streams ~clean_strings
pdf pdf
= =
if clean_strings then preprocess_strings pdf; if clean_strings then preprocess_strings pdf;
@ -480,9 +485,9 @@ let json_of_pdf
(fun _ obj -> match obj with Pdf.Stream _ -> Pdfcodec.decode_pdfstream_until_unknown pdf obj | _ -> ()) (fun _ obj -> match obj with Pdf.Stream _ -> Pdfcodec.decode_pdfstream_until_unknown pdf obj | _ -> ())
pdf; pdf;
Pdf.remove_unreferenced pdf; Pdf.remove_unreferenced pdf;
let trailerdict = (0, json_of_object pdf (fun x -> ()) ~no_stream_data ~parse_content:false pdf.P.trailerdict) in let trailerdict = (0, json_of_object ~utf8 pdf (fun x -> ()) ~no_stream_data ~parse_content:false pdf.P.trailerdict) in
let parameters = let parameters =
(-1, json_of_object pdf (fun x -> ()) ~no_stream_data:false ~parse_content:false (-1, json_of_object ~utf8 pdf (fun x -> ()) ~no_stream_data:false ~parse_content:false
(Pdf.Dictionary [("/CPDFJSONformatversion", Pdf.Integer 2); (Pdf.Dictionary [("/CPDFJSONformatversion", Pdf.Integer 2);
("/CPDFJSONcontentparsed", Pdf.Boolean parse_content); ("/CPDFJSONcontentparsed", Pdf.Boolean parse_content);
("/CPDFJSONstreamdataincluded", Pdf.Boolean (not no_stream_data)); ("/CPDFJSONstreamdataincluded", Pdf.Boolean (not no_stream_data));
@ -499,7 +504,7 @@ let json_of_pdf
let ps = ref [] in let ps = ref [] in
P.objiter P.objiter
(fun i pdfobj -> (fun i pdfobj ->
ps := (i, json_of_object pdf fcs ~no_stream_data ~parse_content pdfobj)::!ps) ps := (i, json_of_object ~utf8 pdf fcs ~no_stream_data ~parse_content pdfobj)::!ps)
pdf; pdf;
parameters::trailerdict::sort compare !ps parameters::trailerdict::sort compare !ps
in in
@ -522,7 +527,7 @@ let json_of_pdf
| _ -> assert false | _ -> assert false
in in
(objnum, (objnum,
`Assoc ["S", `List [dict; parse_content_stream pdf (P.Dictionary []) streamdata]]) `Assoc ["S", `List [dict; parse_content_stream utf8 pdf (P.Dictionary []) streamdata]])
| _ -> error "json_of_pdf: stream parsing inconsistency" | _ -> error "json_of_pdf: stream parsing inconsistency"
end end
else else
@ -534,8 +539,8 @@ let json_of_pdf
(fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj]) (fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj])
pairs_parsed) pairs_parsed)
let to_output o ?(utf8=false) ~parse_content ~no_stream_data ~decompress_streams ?(clean_strings=false) pdf = let to_output o ~utf8 ~parse_content ~no_stream_data ~decompress_streams ?(clean_strings=false) pdf =
let json = json_of_pdf ~parse_content ~no_stream_data ~decompress_streams ~clean_strings pdf in let json = json_of_pdf ~utf8 ~parse_content ~no_stream_data ~decompress_streams ~clean_strings pdf in
match o.Pdfio.out_caml_channel with match o.Pdfio.out_caml_channel with
| Some ch -> J.pretty_to_channel ch json | Some ch -> J.pretty_to_channel ch json
| None -> o.Pdfio.output_string (J.pretty_to_string json) | None -> o.Pdfio.output_string (J.pretty_to_string json)

View File

@ -4,7 +4,7 @@
streams, [no_stream_data] will omit stream data, [decompress_streams] streams, [no_stream_data] will omit stream data, [decompress_streams]
decompresses all streams, [clean_strings] will convert any UTF16BE strings decompresses all streams, [clean_strings] will convert any UTF16BE strings
to PDFDocEncoding if it can. *) to PDFDocEncoding if it can. *)
val to_output : Pdfio.output -> ?utf8:bool -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> ?clean_strings:bool -> Pdf.t -> unit val to_output : Pdfio.output -> utf8:bool -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> ?clean_strings:bool -> Pdf.t -> unit
(** Read a CPDFJSON PDF from an input. /Length entries will be corrected automatically. *) (** Read a CPDFJSON PDF from an input. /Length entries will be corrected automatically. *)
val of_input : Pdfio.input -> Pdf.t val of_input : Pdfio.input -> Pdf.t
@ -13,7 +13,7 @@ val of_input : Pdfio.input -> Pdf.t
Then the PDF file, then a function which is usually [function _ -> ()], then Then the PDF file, then a function which is usually [function _ -> ()], then
[no_stream_data] as above, then [parse_content_streams] as above, and [no_stream_data] as above, then [parse_content_streams] as above, and
finally the object itself. *) finally the object itself. *)
val json_of_object : ?utf8:bool -> ?clean_strings:bool -> Pdf.t -> (int -> unit) -> no_stream_data:bool -> parse_content:bool -> Pdf.pdfobject -> Cpdfyojson.Safe.t val json_of_object : utf8:bool -> ?clean_strings:bool -> Pdf.t -> (int -> unit) -> no_stream_data:bool -> parse_content:bool -> Pdf.pdfobject -> Cpdfyojson.Safe.t
(** Convert a single CPDFJSON object to a PDF object *) (** Convert a single CPDFJSON object to a PDF object *)
val object_of_json : ?utf8:bool -> Cpdfyojson.Safe.t -> Pdf.pdfobject val object_of_json : utf8:bool -> Cpdfyojson.Safe.t -> Pdf.pdfobject

View File

@ -269,7 +269,7 @@ let print_dict_entry pdf key =
match Pdf.lookup_direct pdf key d with match Pdf.lookup_direct pdf key d with
| Some v -> | Some v ->
(* We use a double newline as a separator. *) (* We use a double newline as a separator. *)
Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false v)); Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object ~utf8:false ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false v));
d d
| None -> d | None -> d
in in
@ -281,7 +281,7 @@ let get_dict_entries pdf key =
let es = ref [] in let es = ref [] in
let f d = let f d =
match Pdf.lookup_direct pdf key d with match Pdf.lookup_direct pdf key d with
| Some v -> es := Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false v::!es; d | Some v -> es := Cpdfjson.json_of_object ~utf8:false ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false v::!es; d
| None -> d | None -> d
in in
Pdf.objselfmap (dict_entry_single_object f pdf) pdf; Pdf.objselfmap (dict_entry_single_object f pdf) pdf;