more
This commit is contained in:
parent
d3f847314a
commit
588f265a8c
23
cpdfjson.ml
23
cpdfjson.ml
|
@ -16,9 +16,6 @@ number, and flags used when writing (which may be required when reading):
|
|||
round-trip if false).
|
||||
o /CPDFJSONmajorpdfversion (CPDFJSON integer)
|
||||
o /CPDFJSONminorpdfversion (CPDFJSON integer)
|
||||
o /CPDFJSONisUTF8 (Optional. Format 3. If true, strings are converted to UTF8
|
||||
before conversion to JSON, and converted back to PDFDocEncoding/UTF16BE during
|
||||
converstion to PDF.)
|
||||
|
||||
Object 0: The PDF's trailer dictionary
|
||||
|
||||
|
@ -30,7 +27,17 @@ Objects 1..n: The PDF's objects.
|
|||
o Names are written as {"N": "/Pages"}
|
||||
o Indirect references are integers
|
||||
o Streams are {"S": [dict, data]}
|
||||
o Strings are converted into JSON strings in a way which is fully reversible.
|
||||
o Strings are converted into JSON strings in a way which is fully reversible.
|
||||
In original (utf8=false) mode, the bytes of the string in PDF representation
|
||||
are converted into UTF8, rather than the string itself being converted. In
|
||||
UTF8 mode (utf8=true), instead:
|
||||
- If a String contains only PDFDocEncoding characters, is is converted
|
||||
to UTF8, and stored as {"U" : "..."}
|
||||
- If a String has a BOM and successfully converts to UTF8, it is converted
|
||||
to UTF8, and stored as {"V" : "..."}
|
||||
- If a String has a BOM but fails to convert, or has no BOM, it is stored
|
||||
in original mode, as an unmarked string.
|
||||
In all cases, this process is still reversible.
|
||||
|
||||
There are two subformats: parsing content streams or not. Hello World in CPDF
|
||||
JSON without parsing content streams:
|
||||
|
@ -285,12 +292,15 @@ let mkfloat f = `Assoc [("F", `Float f)]
|
|||
let mkint i = `Assoc [("I", `Int i)]
|
||||
let mkname n = `Assoc [("N", `String n)]
|
||||
|
||||
let json_string_of_pdfstring_utf8 =
|
||||
Pdftext.utf8_of_pdfdocstring
|
||||
|
||||
let rec json_of_object ~utf8 ?(clean_strings=false) pdf fcs ~no_stream_data ~parse_content = function
|
||||
| P.Null -> `Null
|
||||
| P.Boolean b -> `Bool b
|
||||
| P.Integer i -> mkint i
|
||||
| P.Real r -> mkfloat r
|
||||
| P.String s -> `String (if clean_strings then Pdftext.simplify_utf16be s else if utf8 then Pdftext.utf8_of_pdfdocstring s else s)
|
||||
| P.String s -> `String (if clean_strings then Pdftext.simplify_utf16be s else if utf8 then json_string_of_pdfstring_utf8 s else s)
|
||||
| P.Name n -> mkname n
|
||||
| P.Array objs -> `List (map (json_of_object ~utf8 pdf fcs ~no_stream_data ~parse_content) objs)
|
||||
| P.Dictionary elts ->
|
||||
|
@ -489,8 +499,7 @@ let json_of_pdf
|
|||
(fun _ obj -> match obj with Pdf.Stream _ -> Pdfcodec.decode_pdfstream_until_unknown pdf obj | _ -> ())
|
||||
pdf;
|
||||
Pdf.remove_unreferenced pdf;
|
||||
(* Not UTF8, because /ID strings are not actually in PDFDocEncoding *)
|
||||
let trailerdict = (0, json_of_object ~utf8:false pdf (fun x -> ()) ~no_stream_data ~parse_content:false pdf.P.trailerdict) in
|
||||
let trailerdict = (0, json_of_object ~utf8 pdf (fun x -> ()) ~no_stream_data ~parse_content:false pdf.P.trailerdict) in
|
||||
let parameters =
|
||||
(-1, json_of_object ~utf8 pdf (fun x -> ()) ~no_stream_data:false ~parse_content:false
|
||||
(Pdf.Dictionary [("/CPDFJSONformatversion", Pdf.Integer 3);
|
||||
|
|
Loading…
Reference in New Issue