2021-10-15 15:50:08 +02:00
|
|
|
|
(* Read and write PDF files in JSON format.
|
|
|
|
|
|
2023-01-17 06:54:23 +01:00
|
|
|
|
Format version 3: adds UTF8 option for strings for easier editing
|
|
|
|
|
Format version 2: adds object -1 with format data, roundtripping
|
|
|
|
|
Format version 1: no format specifier, output only
|
2023-01-16 07:29:54 +01:00
|
|
|
|
|
2021-10-15 15:50:08 +02:00
|
|
|
|
The file is an array of arrays containing an object number followed by an
|
|
|
|
|
object, one for each object in the file and two special ones:
|
|
|
|
|
|
|
|
|
|
Object -1: CPDF's own data with the PDF version number, CPDF JSON format
|
|
|
|
|
number, and flags used when writing (which may be required when reading):
|
|
|
|
|
|
2023-01-16 07:29:54 +01:00
|
|
|
|
o /CPDFJSONformatversion (CPDFJSON integer (see below), currently 3)
|
2021-10-15 15:50:08 +02:00
|
|
|
|
o /CPDFJSONcontentparsed (boolean, true if content streams have been parsed)
|
|
|
|
|
o /CPDFJSONstreamdataincluded (boolean, true if stream data included. Cannot
|
|
|
|
|
round-trip if false).
|
2021-10-29 19:17:18 +02:00
|
|
|
|
o /CPDFJSONmajorpdfversion (CPDFJSON integer)
|
|
|
|
|
o /CPDFJSONminorpdfversion (CPDFJSON integer)
|
2021-10-15 15:50:08 +02:00
|
|
|
|
|
|
|
|
|
Object 0: The PDF's trailer dictionary
|
|
|
|
|
|
|
|
|
|
Objects 1..n: The PDF's objects.
|
|
|
|
|
|
|
|
|
|
o PDF arrays, dictionaries, booleans, and strings are the same in JSON.
|
|
|
|
|
o Integers are written as {"I": 0}
|
|
|
|
|
o Floats are written as {"F": 0.0}
|
|
|
|
|
o Names are written as {"N": "/Pages"}
|
|
|
|
|
o Indirect references are integers
|
|
|
|
|
o Streams are {"S": [dict, data]}
|
2023-02-15 17:05:59 +01:00
|
|
|
|
o Strings are converted into JSON strings in a way which is fully reversible.
|
|
|
|
|
In original (utf8=false) mode, the bytes of the string in PDF representation
|
|
|
|
|
are converted into UTF8, rather than the string itself being converted. In
|
|
|
|
|
UTF8 mode (utf8=true), instead:
|
|
|
|
|
- If a String contains only PDFDocEncoding characters, it is converted
|
|
|
|
|
to UTF8, and stored as {"U" : "..."}
|
|
|
|
|
- If a String has a BOM and successfully converts to UTF8, it is converted
|
|
|
|
|
to UTF8, and stored as {"V" : "..."}
|
|
|
|
|
- If a String has a BOM but fails to convert, or has no BOM, it is stored
|
|
|
|
|
in original mode, as an unmarked string.
|
|
|
|
|
In all cases, this process is still reversible.
|
2021-10-29 19:17:18 +02:00
|
|
|
|
|
2021-10-15 15:50:08 +02:00
|
|
|
|
There are two subformats: parsing content streams or not. Hello World in CPDF
|
|
|
|
|
JSON without parsing content streams:
|
|
|
|
|
|
|
|
|
|
[
|
|
|
|
|
[
|
|
|
|
|
-1, { "/CPDFJSONformatversion": { "I": 2 },
|
|
|
|
|
"/CPDFJSONcontentparsed": false, "/CPDFJSONstreamdataincluded": true,
|
|
|
|
|
"/CPDFJSONmajorpdfversion": { "I": 1 },
|
|
|
|
|
"/CPDFJSONminorpdfversion": { "I": 1 } } ], [
|
|
|
|
|
0, { "/Size": { "I": 4 }, "/Root": 4,
|
|
|
|
|
"/ID": [ "èÎ25\u001e³/°q:OÊ°u", "èÎ25\u001e³/°q:OÊ°u" ] } ], [
|
|
|
|
|
1, { "/Type": { "N": "/Pages" }, "/Kids": [ 3 ], "/Count": { "I": 1 } } ],
|
|
|
|
|
[
|
|
|
|
|
2, {
|
|
|
|
|
"S": [
|
|
|
|
|
{ "/Length": { "I": 49 } },
|
|
|
|
|
"1 0 0 1 50 770 cm BT/F0 36 Tf(Hello, World!)Tj ET"
|
|
|
|
|
] } ], [
|
|
|
|
|
3, { "/Type": { "N": "/Page" }, "/Parent": 1,
|
|
|
|
|
"/Resources": {
|
|
|
|
|
"/Font": {
|
|
|
|
|
"/F0": {
|
|
|
|
|
"/Type": { "N": "/Font" },
|
|
|
|
|
"/Subtype": { "N": "/Type1" },
|
|
|
|
|
"/BaseFont": { "N": "/Times-Italic" }
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"/MediaBox": [
|
|
|
|
|
{ "I": 0 }, { "I": 0 }, { "F": 595.2755905510001 }, { "F": 841.88976378 }
|
|
|
|
|
], "/Rotate": { "I": 0 }, "/Contents": [ 2 ] } ], [
|
|
|
|
|
4, { "/Type": { "N": "/Catalog" }, "/Pages": 1 } ]
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
Alternative object number 2 when parsing of content streams is in operation:
|
|
|
|
|
|
|
|
|
|
2, {
|
|
|
|
|
"S": [
|
|
|
|
|
{}, [
|
|
|
|
|
[
|
|
|
|
|
{ "F": 1.0 }, { "F": 0.0 }, { "F": 0.0 }, { "F": 1.0 }, { "F": 50.0 }, {
|
|
|
|
|
"F": 770.0 }, "cm" ], [ "BT" ], [ "/F0", { "F": 36.0 }, "Tf" ], [
|
|
|
|
|
"Hello, World!", "Tj" ], [ "ET" ] ]
|
|
|
|
|
] } ], [
|
|
|
|
|
|
|
|
|
|
When parsing content streams:
|
|
|
|
|
|
|
|
|
|
o Each operation is an array
|
|
|
|
|
o The 'operation' for inline images is "InlineImage"
|
|
|
|
|
|
|
|
|
|
CPDF currently never preserves object streams, and only outputs unencrypted files.
|
|
|
|
|
|
2021-12-30 16:25:24 +01:00
|
|
|
|
When reloading a JSON file, CPDF knows how to correct or add /Length entries in
|
2021-10-15 15:50:08 +02:00
|
|
|
|
streams, so you need not worry about them. *)
|
|
|
|
|
|
2020-03-04 19:50:32 +01:00
|
|
|
|
open Pdfutil
|
2021-10-04 14:34:13 +02:00
|
|
|
|
open Cpdferror
|
2020-03-04 19:50:32 +01:00
|
|
|
|
|
2021-10-06 15:06:14 +02:00
|
|
|
|
module J = Cpdfyojson.Safe
|
|
|
|
|
module P = Pdf
|
|
|
|
|
module O = Pdfops
|
2020-01-31 15:46:33 +01:00
|
|
|
|
|
2021-10-01 17:59:05 +02:00
|
|
|
|
(* Read a real-valued operand from its JSON form {"F": x}. The payload [x] may
   be a JSON float, or a JSON integer which is converted with [float_of_int].
   Any other shape is reported through [error]. *)
let opf json =
  match json with
  | `Assoc [("F", `Float x)] -> x
  | `Assoc [("F", `Int n)] -> float_of_int n
  | _ -> error "num: not a float"
|
2021-10-01 17:59:05 +02:00
|
|
|
|
|
|
|
|
|
(* Read an integer operand from its JSON form {"I": n}. The payload may be a
   JSON integer, or a JSON float which is converted with [int_of_float]. Any
   other shape is reported through [error]. *)
let opi json =
  match json with
  | `Assoc [("I", `Int n)] -> n
  | `Assoc [("I", `Float x)] -> int_of_float x
  | _ -> error "num: not an integer"
|
2021-10-01 17:59:05 +02:00
|
|
|
|
|
2023-01-16 08:03:34 +01:00
|
|
|
|
(* Convert the JSON form of a single content-stream operation into a Pdfops
   operator. An operation is a JSON list holding the operands followed by the
   operator name as a string. [utf8] is handed on to [object_of_json] for the
   operands which are general PDF objects (BDC, DP, TJ and the inline image
   dictionary). Input matching no known shape is printed to stderr and then
   reported through [error]. *)
let rec op_of_json utf8 json =
  (* Six numeric operands making up a transformation matrix (cm and Tm). *)
  let matrix a b c d e f =
    {Pdftransform.a = opf a; Pdftransform.b = opf b; Pdftransform.c = opf c;
     Pdftransform.d = opf d; Pdftransform.e = opf e; Pdftransform.f = opf f}
  in
  match json with
  | `List [`String "S"] -> O.Op_S
  | `List [`String "s"] -> O.Op_s
  | `List [`String "f"] -> O.Op_f
  | `List [`String "F"] -> O.Op_F
  | `List [`String "f*"] -> O.Op_f'
  | `List [`String "B"] -> O.Op_B
  | `List [`String "B*"] -> O.Op_B'
  | `List [`String "b"] -> O.Op_b
  | `List [`String "b*"] -> O.Op_b'
  | `List [`String "n"] -> O.Op_n
  | `List [`String "W"] -> O.Op_W
  | `List [`String "W*"] -> O.Op_W'
  | `List [`String "BT"] -> O.Op_BT
  | `List [`String "ET"] -> O.Op_ET
  | `List [`String "q"] -> O.Op_q
  | `List [`String "Q"] -> O.Op_Q
  | `List [`String "h"] -> O.Op_h
  | `List [`String "T*"] -> O.Op_T'
  | `List [`String "EMC"] -> O.Op_EMC
  | `List [`String "BX"] -> O.Op_BX
  | `List [`String "EX"] -> O.Op_EX
  | `List [a; b; c; d; `String "re"] ->
      O.Op_re (opf a, opf b, opf c, opf d)
  | `List [a; b; c; d; `String "k"] ->
      O.Op_k (opf a, opf b, opf c, opf d)
  | `List [a; b; `String "m"] ->
      O.Op_m (opf a, opf b)
  | `List [a; b; `String "l"] ->
      O.Op_l (opf a, opf b)
  | `List [`String s; obj; `String "BDC"] ->
      O.Op_BDC (s, object_of_json ~utf8 obj)
  | `List [`String s; `String "gs"] -> O.Op_gs s
  | `List [`String s; `String "Do"] -> O.Op_Do s
  | `List [`String s; `String "CS"] -> O.Op_CS s
  | `List [i; `String "j"] -> O.Op_j (opi i)
  | `List [a; b; c; d; e; f; `String "cm"] ->
      O.Op_cm (matrix a b c d e f)
  | `List [`List fls; y; `String "d"] ->
      O.Op_d (map opf fls, opf y)
  | `List [a; `String "w"] -> O.Op_w (opf a)
  | `List [a; `String "J"] -> O.Op_J (opi a)
  | `List [a; `String "M"] -> O.Op_M (opf a)
  | `List [`String s; `String "ri"] -> O.Op_ri s
  | `List [a; `String "i"] -> O.Op_i (opi a)
  | `List [a; b; c; d; e; f; `String "c"] ->
      O.Op_c (opf a, opf b, opf c, opf d, opf e, opf f)
  | `List [a; b; c; d; `String "v"] ->
      O.Op_v (opf a, opf b, opf c, opf d)
  | `List [a; b; c; d; `String "y"] ->
      O.Op_y (opf a, opf b, opf c, opf d)
  | `List [a; `String "Tc"] -> O.Op_Tc (opf a)
  | `List [a; `String "Tw"] -> O.Op_Tw (opf a)
  | `List [a; `String "Tz"] -> O.Op_Tz (opf a)
  | `List [a; `String "TL"] -> O.Op_TL (opf a)
  | `List [`String k; n; `String "Tf"] -> O.Op_Tf (k, opf n)
  | `List [a; `String "Tr"] -> O.Op_Tr (opi a)
  | `List [a; `String "Ts"] -> O.Op_Ts (opf a)
  | `List [a; b; `String "Td"] ->
      O.Op_Td (opf a, opf b)
  | `List [a; b; `String "TD"] ->
      O.Op_TD (opf a, opf b)
  | `List [a; b; c; d; e; f; `String "Tm"] ->
      O.Op_Tm (matrix a b c d e f)
  | `List [`String s; `String "Tj"] -> O.Op_Tj s
  | `List [obj; `String "TJ"] ->
      O.Op_TJ (object_of_json ~utf8 obj)
  | `List [`String s; `String "'"] -> O.Op_' s
  | `List [a; b; `String s; `String "''"] ->
      O.Op_'' (opf a, opf b, s)
  | `List [a; b; `String "d0"] ->
      O.Op_d0 (opf a, opf b)
  | `List [a; b; c; d; e; f; `String "d1"] ->
      O.Op_d1 (opf a, opf b, opf c, opf d, opf e, opf f)
  | `List [`String s; `String "cs"] -> O.Op_cs s
  | `List [a; `String "G"] -> O.Op_G (opf a)
  | `List [a; `String "g"] -> O.Op_g (opf a)
  | `List [a; b; c; `String "RG"] ->
      O.Op_RG (opf a, opf b, opf c)
  | `List [a; b; c; `String "rg"] ->
      O.Op_rg (opf a, opf b, opf c)
  | `List [a; b; c; d; `String "K"] ->
      O.Op_K (opf a, opf b, opf c, opf d)
  | `List [`String s; `String "sh"] -> O.Op_sh s
  | `List [`String s; `String "MP"] -> O.Op_MP s
  | `List [`String s; `String "BMC"] -> O.Op_BMC s
  | `List [`String s; `String "Unknown"] -> O.Op_Unknown s
  | `List [`String s; obj; `String "DP"] ->
      O.Op_DP (s, object_of_json ~utf8 obj)
  | `List [a; `String b; `String "InlineImage"] ->
      O.InlineImage (object_of_json ~utf8 a, Pdfio.bytes_of_string b)
  | `List operation ->
      (* The remaining colour operators take a variable number of operands,
         so inspect the list back to front: operator name first, then the
         operands in reverse order. *)
      begin match rev operation with
      | `String "SCN"::ns -> O.Op_SCN (map opf (rev ns))
      | `String "SC"::ns -> O.Op_SC (map opf (rev ns))
      | `String "sc"::ns -> O.Op_sc (map opf (rev ns))
      | `String "scn"::ns -> O.Op_scn (map opf (rev ns))
      | `String "SCNName"::`String s::ns ->
          O.Op_SCNName (s, map opf (rev ns))
      | `String "scnName"::`String s::ns ->
          O.Op_scnName (s, map opf (rev ns))
      | unreadable ->
          Printf.eprintf "Unable to read reversed op from %s\n" (J.show (`List unreadable));
          error "op reading failed"
      end
  | unreadable ->
      Printf.eprintf "Unable to read op from %s\n" (J.show unreadable);
      error "op reading failed"
|
2021-10-01 17:59:05 +02:00
|
|
|
|
|
2023-01-16 08:03:34 +01:00
|
|
|
|
(* Convert a JSON value into a PDF object.

   - null, booleans and lists map to Null, Boolean and Array.
   - A bare JSON integer is an indirect reference.
   - A JSON string is a PDF string. When [utf8] is true it is first converted
     with [Pdftext.pdfdocstring_of_utf8]; otherwise it is used as is.
   - {"I": n}, {"F": x} and {"N": name} are Integer, Real and Name. {"F": n}
     with a JSON integer payload is accepted too (as [opf] already does).
     Without that case it would fall through to the generic dictionary case
     below and silently become a dictionary holding an indirect reference.
   - {"S": [dict, data]} is a stream. [data] is either a string of raw bytes
     or a list of parsed operations which is turned back into stream data
     with [Pdfops.stream_of_ops]. In both cases /Length in the dictionary is
     set from the size of the resulting data.
   - Any other JSON object is a dictionary.

   Anything else is reported through [error]. *)
and object_of_json ~utf8 = function
  | `Null -> P.Null
  | `Bool b -> P.Boolean b
  | `Int n -> P.Indirect n
  | `String s ->
      if utf8 then P.String (Pdftext.pdfdocstring_of_utf8 s) else P.String s
  | `List objs -> P.Array (map (object_of_json ~utf8) objs)
  | `Assoc ["I", `Int i] -> P.Integer i
  | `Assoc ["F", `Float f] -> P.Real f
  | `Assoc ["F", `Int i] -> P.Real (float_of_int i)
  | `Assoc ["N", `String n] -> P.Name n
  | `Assoc ["S", `List [dict; `String data]] ->
      let d' =
        P.add_dict_entry (object_of_json ~utf8 dict) "/Length" (P.Integer (String.length data))
      in
        P.Stream (ref (d', P.Got (Pdfio.bytes_of_string data)))
  | `Assoc ["S", `List [dict; `List parsed_ops]] ->
      begin match
        Pdfops.stream_of_ops (List.map (op_of_json utf8) parsed_ops)
      with
      | P.Stream {contents = (_, P.Got data)} ->
          let d' =
            P.add_dict_entry (object_of_json ~utf8 dict) "/Length" (P.Integer (Pdfio.bytes_size data))
          in
            P.Stream (ref (d', P.Got data))
      | _ -> assert false
      end
  | `Assoc elts ->
      P.Dictionary (map (fun (n, o) -> (n, object_of_json ~utf8 o)) elts)
  | _ -> error "not recognised in object_of_json"
|
2021-10-06 14:52:05 +02:00
|
|
|
|
|
|
|
|
|
(* Build a PDF document from CPDF JSON.

   The top level must be a list of [objnum, object] pairs. Object -1 holds
   CPDF's own parameters, object 0 the trailer dictionary, other negative
   numbers are ignored, and every remaining pair is a PDF object.

   Object -1 is read in a pass of its own before anything else, so that the
   /CPDFJSONisUTF8 flag applies to every object whatever position object -1
   occupies in the file. The parameters and the trailer dictionary are always
   read with utf8 off: /ID strings in the trailer are not PDFDocEncoding, so
   they must not be transformed.

   Reports through [error]: a top level which is not a list, a malformed
   pair, /CPDFJSONstreamdataincluded being false, a missing or bad major or
   minor version, or a trailer without an indirect /Root. *)
let pdf_of_json json =
  let objs =
    match json with `List objs -> objs | _ -> error "bad json top level"
  in
  let params = ref P.Null in
  let utf8 = ref false in
  let trailerdict = ref P.Null in
  (* Look up a key in the parameters read so far. *)
  let lookup_param key = P.lookup_direct (P.empty ()) key !params in
  List.iter
    (function
     | `List [`Int (-1); o] ->
         params := object_of_json ~utf8:false o;
         begin match lookup_param "/CPDFJSONisUTF8" with
         | Some (P.Boolean b) -> utf8 := b
         | _ -> ()
         end
     | _ -> ())
    objs;
  let objects =
    option_map
      (function
       | `List [`Int 0; o] ->
           trailerdict := object_of_json ~utf8:false o;
           None
       | `List [`Int n; _] when n < 0 -> None (* includes -1, handled above *)
       | `List [`Int n; o] -> Some (n, object_of_json ~utf8:!utf8 o)
       | _ -> error "json bad obj")
      objs
  in
  begin match lookup_param "/CPDFJSONstreamdataincluded" with
  | Some (P.Boolean false) -> error "no stream data; cannot reconstruct PDF"
  | _ -> ()
  end;
  let major =
    match lookup_param "/CPDFJSONmajorpdfversion" with
    | Some (P.Integer i) -> i
    | _ -> error "bad major version"
  in
  let minor =
    match lookup_param "/CPDFJSONminorpdfversion" with
    | Some (P.Integer i) -> i
    | _ -> error "bad minor version"
  in
  let root =
    match !trailerdict with
    | P.Dictionary d ->
        begin match lookup "/Root" d with
        | Some (P.Indirect i) -> i
        | _ -> error "bad root"
        end
    | _ -> error "bad root 2"
  in
  let objmap = P.pdfobjmap_empty () in
  List.iter (fun (k, v) -> Hashtbl.add objmap k (ref (P.Parsed v), 0)) objects;
  (* Record the largest object number actually loaded rather than 0.
     NOTE(review): presumably later additions of objects derive fresh numbers
     from this field - confirm against the Pdf module. *)
  let maxobjnum = List.fold_left (fun m (n, _) -> max m n) 0 objects in
  let objects =
    {P.maxobjnum = maxobjnum;
     P.parse = None;
     P.pdfobjects = objmap;
     P.object_stream_ids = Hashtbl.create 0}
  in
    {P.major;
     P.minor;
     P.root;
     P.objects;
     P.trailerdict = !trailerdict;
     P.was_linearized = false;
     P.saved_encryption = None}
|
|
|
|
|
|
|
|
|
|
(* JSON wrappers for the PDF scalars which are not written as bare JSON
   values: reals as {"F": x}, integers as {"I": n} and names as {"N": s}. *)
let mkfloat x = `Assoc ["F", `Float x]

let mkint n = `Assoc ["I", `Int n]

let mkname s = `Assoc ["N", `String s]
|
2021-10-06 14:52:05 +02:00
|
|
|
|
|
2023-02-15 17:05:59 +01:00
|
|
|
|
(* The JSON text used for a PDF string when writing in UTF8 mode: the result
   of [Pdftext.utf8_of_pdfdocstring] on the string. *)
let json_string_of_pdfstring_utf8 s =
  Pdftext.utf8_of_pdfdocstring s
|
|
|
|
|
|
2023-01-16 08:03:34 +01:00
|
|
|
|
(* Convert a PDF object to JSON.

   [utf8]            write strings through [json_string_of_pdfstring_utf8].
   [clean_strings]   write strings through [Pdftext.simplify_utf16be]; takes
                     precedence over [utf8]. It is now passed down to nested
                     arrays, dictionaries and stream dictionaries. Previously
                     it was dropped on every recursive call, so it only ever
                     affected a string at the top level.
   [pdf]             the document, used to look up indirect objects.
   [fcs]             called with the object number of each stream found to be
                     page content (via /Contents), or an indirectly
                     referenced stream with /Subtype /Form or /Type /Pattern.
   [no_stream_data]  replace stream data with a placeholder string.
   [parse_content]   when true, streams carrying /FunctionType are decoded
                     and written out in full even if [no_stream_data] is set.

   A stream whose first component is not a dictionary, or whose data cannot
   be obtained, is reported through [error]. *)
let rec json_of_object ~utf8 ?(clean_strings=false) pdf fcs ~no_stream_data ~parse_content = function
  | P.Null -> `Null
  | P.Boolean b -> `Bool b
  | P.Integer i -> mkint i
  | P.Real r -> mkfloat r
  | P.String s ->
      `String
        (if clean_strings then Pdftext.simplify_utf16be s
         else if utf8 then json_string_of_pdfstring_utf8 s
         else s)
  | P.Name n -> mkname n
  | P.Array objs ->
      `List (map (json_of_object ~utf8 ~clean_strings pdf fcs ~no_stream_data ~parse_content) objs)
  | P.Dictionary elts ->
      (* Report page content streams before converting the entries. *)
      iter
        (function
         | ("/Contents", P.Indirect i) ->
             begin match P.lookup_obj pdf i with
             | P.Array is -> iter (function P.Indirect i -> fcs i | _ -> ()) is
             | _ -> fcs i
             end
         | ("/Contents", P.Array elts) ->
             iter (function P.Indirect i -> fcs i | _ -> ()) elts
         | _ -> ())
        elts;
      `Assoc
        (map
           (fun (k, v) -> (k, json_of_object ~utf8 ~clean_strings pdf fcs ~no_stream_data ~parse_content v))
           elts)
  | P.Stream ({contents = ((P.Dictionary _ as d), _)} as mut) as thestream ->
      P.getstream thestream;
      let str, dict' =
        match P.lookup_direct pdf "/FunctionType" d, parse_content with
        | Some _, true ->
            Pdfcodec.decode_pdfstream_until_unknown pdf thestream;
            let dict = P.remove_dict_entry d "/Filter" in
              begin match !mut with
              | (_, P.Got b) -> (Pdfio.string_of_bytes b, dict)
              | _ -> error "/FunctionType: failure: decomp"
              end
        | _ ->
            if no_stream_data then ("<<stream data elided>>", d) else
              begin match !mut with
              | (_, P.Got b) -> (Pdfio.string_of_bytes b, d)
              | _ -> error "failure: toget"
              end
      in
      (* Only the dictionary goes through the recursive conversion. The
         stream data is emitted as a plain string so that it is never
         subject to UTF8 processing. *)
      let dictjson =
        json_of_object ~utf8 ~clean_strings pdf fcs ~no_stream_data ~parse_content dict'
      in
        `Assoc [("S", `List [dictjson; `String str])]
  | P.Stream _ -> error "error: stream with not-a-dictionary"
  | P.Indirect i ->
      (* Report form XObjects and patterns, then write the reference itself
         as a bare integer. *)
      begin match P.lookup_obj pdf i with
      | P.Stream {contents = ((P.Dictionary _ as d), _)} ->
          begin match P.lookup_direct pdf "/Subtype" d with
          | Some (P.Name "/Form") -> fcs i
          | _ ->
              begin match P.lookup_direct pdf "/Type" d with
              | Some (P.Name "/Pattern") -> fcs i
              | _ -> ()
              end
          end
      | _ -> ()
      end;
      `Int i
|
2021-10-03 18:31:50 +02:00
|
|
|
|
|
2023-01-16 08:03:34 +01:00
|
|
|
|
(* Convert one content-stream operator to its JSON form: a list holding the
   operands followed by the operator name as a string. Numeric operands are
   written with [mkfloat] or [mkint]. Operands which are general PDF objects
   (BDC, DP, TJ and the inline image dictionary) go through [json_of_object]
   with content parsing off and a callback which ignores content streams. *)
let json_of_op utf8 pdf no_stream_data op =
  (* An operator with no operands. *)
  let bare name = `List [`String name] in
  (* An operator whose single operand is written as a plain string. *)
  let named name s = `List [`String s; `String name] in
  (* An operator whose operands are all reals. *)
  let nums name fs = `List (map mkfloat fs @ [`String name]) in
  (* A general PDF object operand. *)
  let obj o =
    json_of_object ~utf8 pdf (fun _ -> ()) ~no_stream_data ~parse_content:false o
  in
  match op with
  | O.Op_S -> bare "S"
  | O.Op_s -> bare "s"
  | O.Op_f -> bare "f"
  | O.Op_F -> bare "F"
  | O.Op_f' -> bare "f*"
  | O.Op_B -> bare "B"
  | O.Op_B' -> bare "B*"
  | O.Op_b -> bare "b"
  | O.Op_b' -> bare "b*"
  | O.Op_n -> bare "n"
  | O.Op_W -> bare "W"
  | O.Op_W' -> bare "W*"
  | O.Op_BT -> bare "BT"
  | O.Op_ET -> bare "ET"
  | O.Op_q -> bare "q"
  | O.Op_Q -> bare "Q"
  | O.Op_h -> bare "h"
  | O.Op_T' -> bare "T*"
  | O.Op_EMC -> bare "EMC"
  | O.Op_BX -> bare "BX"
  | O.Op_EX -> bare "EX"
  | O.Op_re (a, b, c, d) -> nums "re" [a; b; c; d]
  | O.Op_k (c, m, y, k) -> nums "k" [c; m; y; k]
  | O.Op_m (a, b) -> nums "m" [a; b]
  | O.Op_l (a, b) -> nums "l" [a; b]
  | O.Op_BDC (s, o) -> `List [`String s; obj o; `String "BDC"]
  | O.Op_gs s -> named "gs" s
  | O.Op_Do s -> named "Do" s
  | O.Op_CS s -> named "CS" s
  | O.Op_SCN fs -> nums "SCN" fs
  | O.Op_j j -> `List [mkint j; `String "j"]
  | O.Op_cm t ->
      nums "cm"
        [t.Pdftransform.a; t.Pdftransform.b; t.Pdftransform.c;
         t.Pdftransform.d; t.Pdftransform.e; t.Pdftransform.f]
  | O.Op_d (fl, y) ->
      `List [`List (map mkfloat fl); mkfloat y; `String "d"]
  | O.Op_w w -> nums "w" [w]
  | O.Op_J j -> `List [mkint j; `String "J"]
  | O.Op_M m -> nums "M" [m]
  | O.Op_ri s -> named "ri" s
  | O.Op_i i -> `List [mkint i; `String "i"]
  | O.Op_c (a, b, c, d, e, f) -> nums "c" [a; b; c; d; e; f]
  | O.Op_v (a, b, c, d) -> nums "v" [a; b; c; d]
  | O.Op_y (a, b, c, d) -> nums "y" [a; b; c; d]
  | O.Op_Tc c -> nums "Tc" [c]
  | O.Op_Tw w -> nums "Tw" [w]
  | O.Op_Tz z -> nums "Tz" [z]
  | O.Op_TL l -> nums "TL" [l]
  | O.Op_Tf (k, s) -> `List [`String k; mkfloat s; `String "Tf"]
  | O.Op_Tr i -> `List [mkint i; `String "Tr"]
  | O.Op_Ts k -> nums "Ts" [k]
  | O.Op_Td (k, k') -> nums "Td" [k; k']
  | O.Op_TD (k, k') -> nums "TD" [k; k']
  | O.Op_Tm t ->
      nums "Tm"
        [t.Pdftransform.a; t.Pdftransform.b; t.Pdftransform.c;
         t.Pdftransform.d; t.Pdftransform.e; t.Pdftransform.f]
  | O.Op_Tj s -> named "Tj" s
  | O.Op_TJ o -> `List [obj o; `String "TJ"]
  | O.Op_' s -> named "'" s
  | O.Op_'' (k, k', s) ->
      `List [mkfloat k; mkfloat k'; `String s; `String "''"]
  | O.Op_d0 (k, k') -> nums "d0" [k; k']
  | O.Op_d1 (a, b, c, d, e, f) -> nums "d1" [a; b; c; d; e; f]
  | O.Op_cs s -> named "cs" s
  | O.Op_SC fs -> nums "SC" fs
  | O.Op_sc fs -> nums "sc" fs
  | O.Op_scn fs -> nums "scn" fs
  | O.Op_G k -> nums "G" [k]
  | O.Op_g k -> nums "g" [k]
  | O.Op_RG (r, g, b) -> nums "RG" [r; g; b]
  | O.Op_rg (r, g, b) -> nums "rg" [r; g; b]
  | O.Op_K (c, m, y, k) -> nums "K" [c; m; y; k]
  | O.Op_sh s -> named "sh" s
  | O.Op_MP s -> named "MP" s
  | O.Op_BMC s -> named "BMC" s
  | O.Op_Unknown s -> named "Unknown" s
  | O.Op_SCNName (s, fs) ->
      `List (map mkfloat fs @ [`String s; `String "SCNName"])
  | O.Op_scnName (s, fs) ->
      `List (map mkfloat fs @ [`String s; `String "scnName"])
  | O.InlineImage (dict, data) ->
      `List [obj dict; `String (Pdfio.string_of_bytes data); `String "InlineImage"]
  | O.Op_DP (s, o) -> `List [`String s; obj o; `String "DP"]
|
2021-10-06 14:52:05 +02:00
|
|
|
|
|
2020-01-31 13:17:55 +01:00
|
|
|
|
(* Parse the data of one content stream and return its operations as a JSON
   list, one entry per operator. [pdf] and [resources] are handed to the
   operator parser, which needs them for lexing inline images (looking up the
   colourspace). Stream data inside operands is always included. *)
let parse_content_stream utf8 pdf resources bs =
  `List (map (json_of_op utf8 pdf false) (O.parse_stream pdf resources [bs]))
|
2020-01-30 14:10:03 +01:00
|
|
|
|
|
2021-10-14 18:42:00 +02:00
|
|
|
|
(* Give every page at most one content stream. A page whose content is split
   over several streams need not be split on operator boundaries, in which
   case the individual streams would fail to parse on their own. Such pages
   have their streams parsed together and replaced by a single stream built
   from the resulting operators. Other pages are left untouched. *)
let precombine_page_content pdf =
  let combine page =
    match page.Pdfpage.content with
    | _::_::_ as streams ->
        let operators =
          Pdfops.parse_operators pdf page.Pdfpage.resources streams
        in
          {page with Pdfpage.content = [Pdfops.stream_of_ops operators]}
    | [] | [_] -> page
  in
    Pdfpage.change_pages true pdf (map combine (Pdfpage.pages_of_pagetree pdf))
|
|
|
|
|
|
2021-12-30 16:25:24 +01:00
|
|
|
|
(* Rewrite every string inside one object with [Pdftext.simplify_utf16be], so
   that UTF16BE strings which could actually be in PDFDocEncoding (having no
   high bytes) become easier to edit in JSON. Arrays, dictionaries and stream
   dictionaries are traversed; stream data and all other objects are returned
   unchanged. *)
let rec ppstring_single_object pdf obj =
  let recur = ppstring_single_object pdf in
  match obj with
  | Pdf.String s -> Pdf.String (Pdftext.simplify_utf16be s)
  | Pdf.Array a -> Pdf.recurse_array recur a
  | Pdf.Dictionary d -> Pdf.recurse_dict recur d
  | Pdf.Stream {contents = (Pdf.Dictionary dict, data)} ->
      Pdf.Stream {contents = (Pdf.recurse_dict recur dict, data)}
  | other -> other
|
|
|
|
|
|
2021-12-30 16:25:24 +01:00
|
|
|
|
(* Simplify the strings of every object in the document, in place. The
   trailer dictionary is deliberately not processed: its /ID might be messed
   up if it happened to begin with a UTF16BE BOM. *)
let preprocess_strings pdf =
  Pdf.objselfmap (fun obj -> ppstring_single_object pdf obj) pdf
|
2021-10-29 19:17:18 +02:00
|
|
|
|
|
2021-10-12 16:35:08 +02:00
|
|
|
|
(* Convert a whole PDF document to CPDF JSON: a list of [objnum, object]
   pairs starting with object -1 (CPDF's parameters) and object 0 (the
   trailer dictionary), followed by the document's objects in ascending
   order.

   [utf8]                write strings of ordinary objects in UTF8 mode.
   [parse_content]       combine each page's content into one stream and
                         write content streams as lists of parsed operations.
   [no_stream_data]      elide stream data.
   [decompress_streams]  decode every stream as far as possible first.
   [clean_strings]       run [preprocess_strings] on the document first.

   The document is modified in place (string clean-up, stream decoding,
   removal of unreferenced objects).

   The parameters and the trailer dictionary are written with utf8 off,
   matching [pdf_of_json], which always reads them with utf8 off. Writing the
   trailer in UTF8 mode would transform its /ID strings on output with
   nothing to undo the transformation on input. *)
let json_of_pdf
  ~utf8 ~parse_content ~no_stream_data ~decompress_streams ~clean_strings
  pdf
=
  if clean_strings then preprocess_strings pdf;
  let pdf = if parse_content then precombine_page_content pdf else pdf in
  if decompress_streams then
    Pdf.objiter
      (fun _ obj ->
         match obj with
         | Pdf.Stream _ -> Pdfcodec.decode_pdfstream_until_unknown pdf obj
         | _ -> ())
      pdf;
  Pdf.remove_unreferenced pdf;
  let trailerdict =
    (0,
     json_of_object ~utf8:false pdf (fun _ -> ()) ~no_stream_data ~parse_content:false
       pdf.P.trailerdict)
  in
  let parameters =
    (-1,
     json_of_object ~utf8:false pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false
       (Pdf.Dictionary
          [("/CPDFJSONformatversion", Pdf.Integer 3);
           ("/CPDFJSONcontentparsed", Pdf.Boolean parse_content);
           ("/CPDFJSONstreamdataincluded", Pdf.Boolean (not no_stream_data));
           ("/CPDFJSONisUTF8", Pdf.Boolean utf8);
           ("/CPDFJSONmajorpdfversion", Pdf.Integer pdf.Pdf.major);
           ("/CPDFJSONminorpdfversion", Pdf.Integer pdf.Pdf.minor)]))
  in
  (* Object numbers reported as content streams while converting. A hash
     table keeps the membership test below cheap and absorbs duplicates. *)
  let content_streams = Hashtbl.create 16 in
  let fcs n =
    Hashtbl.replace content_streams n ();
    if parse_content then
      Pdfcodec.decode_pdfstream_until_unknown pdf (P.lookup_obj pdf n)
  in
  let pairs =
    let ps = ref [] in
      P.objiter
        (fun i pdfobj ->
           ps := (i, json_of_object ~utf8 pdf fcs ~no_stream_data ~parse_content pdfobj)::!ps)
        pdf;
      parameters::trailerdict::sort compare !ps
  in
  (* With content parsing on, swap the raw data of each content stream for
     its parsed operations, and drop /Filter and /Length from its dictionary
     (the reader recomputes /Length). *)
  let pairs_parsed =
    if not parse_content then pairs else
      map
        (fun (objnum, obj) ->
           if Hashtbl.mem content_streams objnum then
             begin match obj with
             | `Assoc ["S", `List [dict; `String _]] ->
                 let streamdata =
                   match P.lookup_obj pdf objnum with
                   | P.Stream {contents = (_, P.Got b)} -> b
                   | _ -> error "JSON: stream not decoded"
                 in
                 let dict =
                   match dict with
                   | `Assoc d ->
                       `Assoc
                         (option_map
                            (function
                             | (("/Filter" | "/Length"), _) -> None
                             | (a, b) -> Some (a, b))
                            d)
                   | _ -> assert false
                 in
                   (objnum,
                    `Assoc ["S", `List [dict; parse_content_stream utf8 pdf (P.Dictionary []) streamdata]])
             | _ -> error "json_of_pdf: stream parsing inconsistency"
             end
           else
             (objnum, obj))
        pairs
  in
    `List
      (map
         (fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj])
         pairs_parsed)
|
2020-01-30 11:42:24 +01:00
|
|
|
|
|
2023-01-16 08:03:34 +01:00
|
|
|
|
(* Convert [pdf] to JSON with [json_of_pdf] and pretty-print it to the output
   [o]: straight to the underlying OCaml channel when [o] has one, otherwise
   as a single string through the output's own [output_string]. *)
let to_output o ~utf8 ~parse_content ~no_stream_data ~decompress_streams ?(clean_strings=false) pdf =
  let json =
    json_of_pdf ~utf8 ~parse_content ~no_stream_data ~decompress_streams ~clean_strings pdf
  in
    match o.Pdfio.out_caml_channel with
    | None -> o.Pdfio.output_string (J.pretty_to_string json)
    | Some ch -> J.pretty_to_channel ch json
|
2021-10-01 13:16:55 +02:00
|
|
|
|
|
2021-10-01 13:53:21 +02:00
|
|
|
|
(* Read a PDF document from CPDF JSON held in the input [i]. The JSON is
   parsed straight from the underlying OCaml channel when [i] has one;
   otherwise the whole input is read into a string and parsed from that. Any
   exception raised while reading or converting is reported through [error],
   carrying the printed form of the exception. *)
let of_input i =
  let read_json () =
    match i.Pdfio.caml_channel with
    | Some ch -> J.from_channel ch
    | None ->
        let content =
          Pdfio.string_of_bytes (Pdfio.bytes_of_input i 0 i.Pdfio.in_channel_length)
        in
          J.from_string content
  in
    try pdf_of_json (read_json ()) with
      e -> error (Printexc.to_string e)
|