This commit is contained in:
John Whitington 2023-02-15 16:05:59 +00:00
parent d3f847314a
commit 588f265a8c
1 changed files with 16 additions and 7 deletions

View File

@ -16,9 +16,6 @@ number, and flags used when writing (which may be required when reading):
round-trip if false). round-trip if false).
o /CPDFJSONmajorpdfversion (CPDFJSON integer) o /CPDFJSONmajorpdfversion (CPDFJSON integer)
o /CPDFJSONminorpdfversion (CPDFJSON integer) o /CPDFJSONminorpdfversion (CPDFJSON integer)
o /CPDFJSONisUTF8 (Optional. Format 3. If true, strings are converted to UTF8
before conversion to JSON, and converted back to PDFDocEncoding/UTF16BE during
conversion to PDF.)
Object 0: The PDF's trailer dictionary Object 0: The PDF's trailer dictionary
@ -31,6 +28,16 @@ Objects 1..n: The PDF's objects.
o Indirect references are integers o Indirect references are integers
o Streams are {"S": [dict, data]} o Streams are {"S": [dict, data]}
o Strings are converted into JSON strings in a way which is fully reversible. o Strings are converted into JSON strings in a way which is fully reversible.
In original (utf8=false) mode, the bytes of the string in PDF representation
are converted into UTF8, rather than the string itself being converted. In
UTF8 mode (utf8=true), instead:
- If a String contains only PDFDocEncoding characters, it is converted
to UTF8, and stored as {"U" : "..."}
- If a String has a BOM and successfully converts to UTF8, it is converted
to UTF8, and stored as {"V" : "..."}
- If a String has a BOM but fails to convert, or has no BOM, it is stored
in original mode, as an unmarked string.
In all cases, this process is still reversible.
There are two subformats: parsing content streams or not. Hello World in CPDF There are two subformats: parsing content streams or not. Hello World in CPDF
JSON without parsing content streams: JSON without parsing content streams:
@ -285,12 +292,15 @@ let mkfloat f = `Assoc [("F", `Float f)]
let mkint i = `Assoc [("I", `Int i)] let mkint i = `Assoc [("I", `Int i)]
let mkname n = `Assoc [("N", `String n)] let mkname n = `Assoc [("N", `String n)]
(* Produce the JSON string for a PDF string when UTF8 mode is selected (it is
   the conversion applied to [P.String] by [json_of_object] when [~utf8] is
   true and [clean_strings] is false). Defined as a direct alias for
   [Pdftext.utf8_of_pdfdocstring]. *)
let json_string_of_pdfstring_utf8 =
Pdftext.utf8_of_pdfdocstring
let rec json_of_object ~utf8 ?(clean_strings=false) pdf fcs ~no_stream_data ~parse_content = function let rec json_of_object ~utf8 ?(clean_strings=false) pdf fcs ~no_stream_data ~parse_content = function
| P.Null -> `Null | P.Null -> `Null
| P.Boolean b -> `Bool b | P.Boolean b -> `Bool b
| P.Integer i -> mkint i | P.Integer i -> mkint i
| P.Real r -> mkfloat r | P.Real r -> mkfloat r
| P.String s -> `String (if clean_strings then Pdftext.simplify_utf16be s else if utf8 then Pdftext.utf8_of_pdfdocstring s else s) | P.String s -> `String (if clean_strings then Pdftext.simplify_utf16be s else if utf8 then json_string_of_pdfstring_utf8 s else s)
| P.Name n -> mkname n | P.Name n -> mkname n
| P.Array objs -> `List (map (json_of_object ~utf8 pdf fcs ~no_stream_data ~parse_content) objs) | P.Array objs -> `List (map (json_of_object ~utf8 pdf fcs ~no_stream_data ~parse_content) objs)
| P.Dictionary elts -> | P.Dictionary elts ->
@ -489,8 +499,7 @@ let json_of_pdf
(fun _ obj -> match obj with Pdf.Stream _ -> Pdfcodec.decode_pdfstream_until_unknown pdf obj | _ -> ()) (fun _ obj -> match obj with Pdf.Stream _ -> Pdfcodec.decode_pdfstream_until_unknown pdf obj | _ -> ())
pdf; pdf;
Pdf.remove_unreferenced pdf; Pdf.remove_unreferenced pdf;
(* Not UTF8, because /ID strings are not actually in PDFDocEncoding *) let trailerdict = (0, json_of_object ~utf8 pdf (fun x -> ()) ~no_stream_data ~parse_content:false pdf.P.trailerdict) in
let trailerdict = (0, json_of_object ~utf8:false pdf (fun x -> ()) ~no_stream_data ~parse_content:false pdf.P.trailerdict) in
let parameters = let parameters =
(-1, json_of_object ~utf8 pdf (fun x -> ()) ~no_stream_data:false ~parse_content:false (-1, json_of_object ~utf8 pdf (fun x -> ()) ~no_stream_data:false ~parse_content:false
(Pdf.Dictionary [("/CPDFJSONformatversion", Pdf.Integer 3); (Pdf.Dictionary [("/CPDFJSONformatversion", Pdf.Integer 3);