cpdf-source/cpdfjson.ml

557 lines
22 KiB
OCaml
Raw Normal View History

2021-10-15 15:50:08 +02:00
(* Read and write PDF files in JSON format.
The file is an array of arrays containing an object number followed by an
object, one for each object in the file and two special ones:
Object -1: CPDF's own data with the PDF version number, CPDF JSON format
number, and flags used when writing (which may be required when reading):
o /CPDFJSONformatversion (CPDFJSON integer (see below), currently 2)
o /CPDFJSONcontentparsed (boolean, true if content streams have been parsed)
o /CPDFJSONstreamdataincluded (boolean, true if stream data included. Cannot
round-trip if false).
o /CPDFJSONmajorpdfversion (CPDFJSON integer)
o /CPDFJSONminorpdfversion (CPDFJSON integer)
Object 0: The PDF's trailer dictionary
Objects 1..n: The PDF's objects.
o PDF arrays, dictionaries, booleans, and strings are the same in JSON.
o Integers are written as {"I": 0}
o Floats are written as {"F": 0.0}
o Names are written as {"N": "/Pages"}
o Indirect references are integers
o Streams are {"S": [dict, data]}
o Strings are converted from PDFDocEncoding to UTF8 before being encoded in
JSON. When they are read back the process is JSON encoded --> UTF8 -->
PDFDocEncoding. This process is to allow easier editing of strings. This
does not happen to strings within text operators in parsed content streams.
There are two subformats: parsing content streams or not. Hello World in CPDF
JSON without parsing content streams:
[
[
-1, { "/CPDFJSONformatversion": { "I": 2 },
"/CPDFJSONcontentparsed": false, "/CPDFJSONstreamdataincluded": true,
"/CPDFJSONmajorpdfversion": { "I": 1 },
"/CPDFJSONminorpdfversion": { "I": 1 } } ], [
0, { "/Size": { "I": 4 }, "/Root": 4,
"/ID": [ "èÎ25\u001e³/°q:Oʇ°u‰", "èÎ25\u001e³/°q:Oʇ°u‰" ] } ], [
1, { "/Type": { "N": "/Pages" }, "/Kids": [ 3 ], "/Count": { "I": 1 } } ],
[
2, {
"S": [
{ "/Length": { "I": 49 } },
"1 0 0 1 50 770 cm BT/F0 36 Tf(Hello, World!)Tj ET"
] } ], [
3, { "/Type": { "N": "/Page" }, "/Parent": 1,
"/Resources": {
"/Font": {
"/F0": {
"/Type": { "N": "/Font" },
"/Subtype": { "N": "/Type1" },
"/BaseFont": { "N": "/Times-Italic" }
}
}
},
"/MediaBox": [
{ "I": 0 }, { "I": 0 }, { "F": 595.2755905510001 }, { "F": 841.88976378 }
], "/Rotate": { "I": 0 }, "/Contents": [ 2 ] } ], [
4, { "/Type": { "N": "/Catalog" }, "/Pages": 1 } ]
]
Alternative object number 2 when parsing of content streams is in operation:
2, {
"S": [
{}, [
[
{ "F": 1.0 }, { "F": 0.0 }, { "F": 0.0 }, { "F": 1.0 }, { "F": 50.0 }, {
"F": 770.0 }, "cm" ], [ "BT" ], [ "/F0", { "F": 36.0 }, "Tf" ], [
"Hello, World!", "Tj" ], [ "ET" ] ]
] } ], [
When parsing content streams:
o Each operation is an array
o The 'operation' for inline images is "InlineImage"
CPDF currently never preserves object streams, and only outputs unencrypted files.
When reloading a JSON file, CPDF knows how to correct /Length entries in
streams, so you need not worry about them. *)
2020-03-04 19:50:32 +01:00
open Pdfutil
2021-10-04 14:34:13 +02:00
open Cpdferror
2020-03-04 19:50:32 +01:00
2021-10-06 15:06:14 +02:00
module J = Cpdfyojson.Safe
module P = Pdf
module O = Pdfops
2020-01-31 15:46:33 +01:00
2021-10-01 17:59:05 +02:00
(* Read a floating-point operand of a parsed content-stream operation.
   The operand must be a one-entry JSON object keyed "F" whose value is either
   a JSON float or a JSON integer (converted with [float_of_int]). Any other
   shape is reported through [error]. *)
let opf json =
  match json with
  | `Assoc [("F", number)] ->
      begin match number with
      | `Float f -> f
      | `Int i -> float_of_int i
      | _ -> error "num: not a float"
      end
  | _ -> error "num: not a float"
2021-10-01 17:59:05 +02:00
(* Read an integer operand of a parsed content-stream operation.
   The operand must be a one-entry JSON object keyed "I" whose value is either
   a JSON integer or a JSON float (truncated with [int_of_float]). Any other
   shape is reported through [error]. *)
let opi json =
  match json with
  | `Assoc [("I", number)] ->
      begin match number with
      | `Int i -> i
      | `Float f -> int_of_float f
      | _ -> error "num: not an integer"
      end
  | _ -> error "num: not an integer"
2021-10-01 17:59:05 +02:00
2021-10-03 19:11:29 +02:00
(* Convert the JSON form of one content-stream operation into a [Pdfops]
   operator. Each operation is a JSON array whose last element is the operator
   name, preceded by its operands. Numeric operands are read with [opf] /
   [opi]; embedded PDF objects are read with [object_of_json].

   If no case applies, the offending JSON is printed on stderr and [error] is
   called with "op reading failed". *)
let rec op_of_json json =
  (* A list of float operands. *)
  let floats operands = map opf operands in
  (* The six operands shared by the "cm" and "Tm" operators. *)
  let matrix a b c d e f =
    {Pdftransform.a = opf a; Pdftransform.b = opf b; Pdftransform.c = opf c;
     Pdftransform.d = opf d; Pdftransform.e = opf e; Pdftransform.f = opf f}
  in
  match json with
  (* Operators which take no operands. *)
  | `List [`String "S"] -> O.Op_S
  | `List [`String "s"] -> O.Op_s
  | `List [`String "f"] -> O.Op_f
  | `List [`String "F"] -> O.Op_F
  | `List [`String "f*"] -> O.Op_f'
  | `List [`String "B"] -> O.Op_B
  | `List [`String "B*"] -> O.Op_B'
  | `List [`String "b"] -> O.Op_b
  | `List [`String "b*"] -> O.Op_b'
  | `List [`String "n"] -> O.Op_n
  | `List [`String "W"] -> O.Op_W
  | `List [`String "W*"] -> O.Op_W'
  | `List [`String "BT"] -> O.Op_BT
  | `List [`String "ET"] -> O.Op_ET
  | `List [`String "q"] -> O.Op_q
  | `List [`String "Q"] -> O.Op_Q
  | `List [`String "h"] -> O.Op_h
  | `List [`String "T*"] -> O.Op_T'
  | `List [`String "EMC"] -> O.Op_EMC
  | `List [`String "BX"] -> O.Op_BX
  | `List [`String "EX"] -> O.Op_EX
  (* Operators with a fixed number of operands. *)
  | `List [p; q; r; s; `String "re"] -> O.Op_re (opf p, opf q, opf r, opf s)
  | `List [p; q; r; s; `String "k"] -> O.Op_k (opf p, opf q, opf r, opf s)
  | `List [p; q; `String "m"] -> O.Op_m (opf p, opf q)
  | `List [p; q; `String "l"] -> O.Op_l (opf p, opf q)
  | `List [`String name; obj; `String "BDC"] ->
      O.Op_BDC (name, object_of_json obj)
  | `List [`String name; `String "gs"] -> O.Op_gs name
  | `List [`String name; `String "Do"] -> O.Op_Do name
  | `List [`String name; `String "CS"] -> O.Op_CS name
  | `List [n; `String "j"] -> O.Op_j (opi n)
  | `List [a; b; c; d; e; f; `String "cm"] -> O.Op_cm (matrix a b c d e f)
  | `List [`List numbers; last; `String "d"] ->
      O.Op_d (floats numbers, opf last)
  | `List [p; `String "w"] -> O.Op_w (opf p)
  | `List [n; `String "J"] -> O.Op_J (opi n)
  | `List [p; `String "M"] -> O.Op_M (opf p)
  | `List [`String name; `String "ri"] -> O.Op_ri name
  | `List [n; `String "i"] -> O.Op_i (opi n)
  | `List [a; b; c; d; e; f; `String "c"] ->
      O.Op_c (opf a, opf b, opf c, opf d, opf e, opf f)
  | `List [p; q; r; s; `String "v"] -> O.Op_v (opf p, opf q, opf r, opf s)
  | `List [p; q; r; s; `String "y"] -> O.Op_y (opf p, opf q, opf r, opf s)
  | `List [p; `String "Tc"] -> O.Op_Tc (opf p)
  | `List [p; `String "Tw"] -> O.Op_Tw (opf p)
  | `List [p; `String "Tz"] -> O.Op_Tz (opf p)
  | `List [p; `String "TL"] -> O.Op_TL (opf p)
  | `List [`String name; size; `String "Tf"] -> O.Op_Tf (name, opf size)
  | `List [n; `String "Tr"] -> O.Op_Tr (opi n)
  | `List [p; `String "Ts"] -> O.Op_Ts (opf p)
  | `List [p; q; `String "Td"] -> O.Op_Td (opf p, opf q)
  | `List [p; q; `String "TD"] -> O.Op_TD (opf p, opf q)
  | `List [a; b; c; d; e; f; `String "Tm"] -> O.Op_Tm (matrix a b c d e f)
  | `List [`String text; `String "Tj"] -> O.Op_Tj text
  | `List [obj; `String "TJ"] -> O.Op_TJ (object_of_json obj)
  | `List [`String text; `String "'"] -> O.Op_' text
  | `List [p; q; `String text; `String "''"] -> O.Op_'' (opf p, opf q, text)
  | `List [p; q; `String "d0"] -> O.Op_d0 (opf p, opf q)
  | `List [a; b; c; d; e; f; `String "d1"] ->
      O.Op_d1 (opf a, opf b, opf c, opf d, opf e, opf f)
  | `List [`String name; `String "cs"] -> O.Op_cs name
  | `List [p; `String "G"] -> O.Op_G (opf p)
  | `List [p; `String "g"] -> O.Op_g (opf p)
  | `List [p; q; r; `String "RG"] -> O.Op_RG (opf p, opf q, opf r)
  | `List [p; q; r; `String "rg"] -> O.Op_rg (opf p, opf q, opf r)
  | `List [p; q; r; s; `String "K"] -> O.Op_K (opf p, opf q, opf r, opf s)
  | `List [`String name; `String "sh"] -> O.Op_sh name
  | `List [`String name; `String "MP"] -> O.Op_MP name
  | `List [`String name; `String "BMC"] -> O.Op_BMC name
  | `List [`String text; `String "Unknown"] -> O.Op_Unknown text
  | `List [`String name; obj; `String "DP"] ->
      O.Op_DP (name, object_of_json obj)
  | `List [dict; `String data; `String "InlineImage"] ->
      O.InlineImage (object_of_json dict, Pdfio.bytes_of_string data)
  (* Operators with a variable number of float operands: look at the list
     back-to-front so the operator name (and, for the "Name" variants, the
     name before it) comes first. *)
  | `List elements ->
      begin match rev elements with
      | `String "SCN"::operands -> O.Op_SCN (floats (rev operands))
      | `String "SC"::operands -> O.Op_SC (floats (rev operands))
      | `String "sc"::operands -> O.Op_sc (floats (rev operands))
      | `String "scn"::operands -> O.Op_scn (floats (rev operands))
      | `String "SCNName"::`String name::operands ->
          O.Op_SCNName (name, floats (rev operands))
      | `String "scnName"::`String name::operands ->
          O.Op_scnName (name, floats (rev operands))
      | reversed ->
          Printf.eprintf "Unable to read reversed op from %s\n" (J.show (`List reversed));
          error "op reading failed"
      end
  | other ->
      Printf.eprintf "Unable to read op from %s\n" (J.show other);
      error "op reading failed"
2021-10-01 17:59:05 +02:00
2021-10-03 19:11:29 +02:00
(* Convert a CPDF JSON value into a PDF object.

   - null, booleans and strings map to their PDF counterparts;
   - a bare JSON integer is an indirect reference;
   - {"I": n}, {"F": x} and {"N": "/Name"} are integers, reals and names;
   - {"S": [dict, "data"]} is a stream given as raw data;
   - {"S": [dict, [ops...]]} is a stream given as parsed operations, which
     are converted with [op_of_json] and re-serialised;
   - any other JSON object is a dictionary.

   In both stream forms /Length in the dictionary is set from the actual data.
   Anything else is reported through [error]. *)
and object_of_json = function
  | `Null -> P.Null
  | `Bool b -> P.Boolean b
  | `Int n -> Pdf.Indirect n
  | `String s -> P.String s
  | `List objs -> P.Array (map object_of_json objs)
  | `Assoc ["I", `Int i] -> P.Integer i
  (* Accept the same numeric spellings as [opi] and [opf]. Without these two
     cases {"F": 1} would fall through to the dictionary case and be read as a
     dictionary containing an indirect reference, and {"I": 1.0} would be
     rejected. *)
  | `Assoc ["I", `Float f] -> P.Integer (int_of_float f)
  | `Assoc ["F", `Float f] -> P.Real f
  | `Assoc ["F", `Int i] -> P.Real (float_of_int i)
  | `Assoc ["N", `String n] -> P.Name n
  | `Assoc ["S", `List [dict; `String data]] ->
      let d' =
        P.add_dict_entry (object_of_json dict) "/Length" (P.Integer (String.length data))
      in
        P.Stream (ref (d', P.Got (Pdfio.bytes_of_string data)))
  | `Assoc ["S", `List [dict; `List parsed_ops]] ->
      begin match
        Pdfops.stream_of_ops (List.map op_of_json parsed_ops)
      with
      | P.Stream {contents = (_, Pdf.Got data)} ->
          let d' =
            P.add_dict_entry (object_of_json dict) "/Length" (P.Integer (Pdfio.bytes_size data))
          in
            P.Stream (ref (d', Pdf.Got data))
      | _ -> assert false
      end
  | `Assoc elts -> P.Dictionary (map (fun (n, o) -> (n, object_of_json o)) elts)
  | _ -> error "not recognised in object_of_json"
2021-10-06 14:52:05 +02:00
(* Build a PDF document from a parsed CPDF JSON file.

   [json] must be an array of [objnum, object] pairs. Object -1 holds the
   CPDFJSON parameters, object 0 the trailer dictionary, other negative
   numbers are ignored, and positive numbers are the PDF's objects.

   Calls [error] if the top level or an entry is malformed, if the parameters
   say stream data was not included, if the major/minor PDF version is missing
   or not an integer, or if the trailer dictionary has no indirect /Root. *)
let pdf_of_json json =
  let objs = match json with `List objs -> objs | _ -> error "bad json top level" in
  let params = ref Pdf.Null in
  let trailerdict = ref Pdf.Null in
  let objects =
    option_map
      (function
       | `List [`Int objnum; o] ->
           begin match objnum with
           | -1 -> params := object_of_json o; None
           | 0 -> trailerdict := object_of_json o; None
           | n when n < 0 -> None
           | n -> Some (n, object_of_json o)
           end
       | _ -> error "json bad obj")
      objs
  in
  (* The parameters are a direct dictionary, so an empty PDF suffices for the
     lookups. *)
  let param name = Pdf.lookup_direct (Pdf.empty ()) name !params in
  begin match param "/CPDFJSONstreamdataincluded" with
  | Some (Pdf.Boolean false) -> error "no stream data; cannot reconstruct PDF"
  | _ -> ()
  end;
  let major =
    match param "/CPDFJSONmajorpdfversion" with
    | Some (Pdf.Integer i) -> i
    | _ -> error "bad major version"
  in
  let minor =
    match param "/CPDFJSONminorpdfversion" with
    | Some (Pdf.Integer i) -> i
    | _ -> error "bad minor version"
  in
  let root =
    match !trailerdict with
    | Pdf.Dictionary d ->
        begin match lookup "/Root" d with
        | Some (Pdf.Indirect i) -> i
        | _ -> error "bad root"
        end
    | _ -> error "bad root 2"
  in
  let objmap = P.pdfobjmap_empty () in
  List.iter (fun (k, v) -> Hashtbl.add objmap k (ref (P.Parsed v), 0)) objects;
  (* Record the highest object number actually loaded instead of a constant 0.
     NOTE(review): this assumes the Pdf module hands out fresh object numbers
     starting from maxobjnum + 1, in which case leaving it at 0 would let a
     newly added object overwrite object 1 - confirm against camlpdf. *)
  let maxobjnum = List.fold_left (fun acc (n, _) -> max acc n) 0 objects in
  let objects =
    {P.maxobjnum = maxobjnum;
     P.parse = None;
     P.pdfobjects = objmap;
     P.object_stream_ids = Hashtbl.create 0}
  in
    {P.major;
     P.minor;
     P.root;
     P.objects;
     P.trailerdict = !trailerdict;
     P.was_linearized = false;
     P.saved_encryption = None}
(* Wrap [value] in a one-entry JSON object keyed by [tag]. *)
let tagged_json tag value = `Assoc [(tag, value)]

(* CPDF JSON spelling of a PDF real: {"F": f}. *)
let mkfloat f = tagged_json "F" (`Float f)

(* CPDF JSON spelling of a PDF integer: {"I": i}. *)
let mkint i = tagged_json "I" (`Int i)

(* CPDF JSON spelling of a PDF name: {"N": n}. *)
let mkname n = tagged_json "N" (`String n)
2021-10-06 14:52:05 +02:00
2021-10-12 17:09:58 +02:00
(* Convert a PDF object to CPDF JSON.

   [fcs] is called with the object number of every stream this function
   recognises as a content stream: the targets of /Contents entries, and
   indirectly referenced streams whose /Subtype is /Form or whose /Type is
   /Pattern.
   [no_stream_data] replaces stream data with a placeholder string.
   [pcs] being true additionally decodes streams that have a /FunctionType
   entry and drops their /Filter entry.

   A stream is written as the dictionary {"S": [dict, data]}. *)
let rec json_of_object pdf fcs no_stream_data pcs obj =
  let convert = json_of_object pdf fcs no_stream_data pcs in
  (* Report an indirect reference as a content stream; ignore anything else. *)
  let register = function P.Indirect n -> fcs n | _ -> () in
  match obj with
  | P.Null -> `Null
  | P.Boolean b -> `Bool b
  | P.Integer i -> mkint i
  | P.Real r -> mkfloat r
  | P.String s -> `String s
  | P.Name n -> mkname n
  | P.Array items -> `List (map convert items)
  | P.Dictionary entries ->
      (* First note any content streams named by /Contents, which may be a
         direct array of references, a reference to such an array, or a
         reference to a single stream. *)
      iter
        (fun (key, value) ->
           match key, value with
           | "/Contents", P.Indirect n ->
               begin match P.lookup_obj pdf n with
               | P.Array refs -> iter register refs
               | _ -> fcs n
               end
           | "/Contents", P.Array refs -> iter register refs
           | _ -> ())
        entries;
      `Assoc (map (fun (key, value) -> (key, convert value)) entries)
  | P.Stream ({contents = ((P.Dictionary _ as dict), _)} as cell) ->
      P.getstream obj;
      (* The stream's data as a string, or [error failure] if it is not
         loaded. *)
      let data_or failure =
        match !cell with
        | (_, P.Got bytes) -> Pdfio.string_of_bytes bytes
        | _ -> error failure
      in
      let data, dict' =
        match P.lookup_direct pdf "/FunctionType" dict, pcs with
        | Some _, true ->
            Pdfcodec.decode_pdfstream_until_unknown pdf obj;
            let unfiltered = P.remove_dict_entry dict "/Filter" in
              (data_or "/FunctionType: failure: decomp", unfiltered)
        | _ ->
            if no_stream_data then ("<<stream data elided>>", dict)
            else (data_or "failure: toget", dict)
      in
        convert (P.Dictionary [("S", P.Array [dict'; P.String data])])
  | P.Stream _ -> error "error: stream with not-a-dictionary"
  | P.Indirect n ->
      let is_name wanted = function
        | Some (P.Name found) -> found = wanted
        | _ -> false
      in
      begin match P.lookup_obj pdf n with
      | P.Stream {contents = ((P.Dictionary _ as dict), _)} ->
          (* /Type is only consulted when /Subtype is not /Form. *)
          if is_name "/Form" (P.lookup_direct pdf "/Subtype" dict)
             || is_name "/Pattern" (P.lookup_direct pdf "/Type" dict)
          then fcs n
      | _ -> ()
      end;
      `Int n
2020-02-04 17:50:04 +01:00
(* Convert one [Pdfops] operator to its CPDF JSON form: an array holding the
   operands followed by the operator name as a string. Floats and integers are
   written with [mkfloat] / [mkint]; embedded PDF objects are written with
   [json_of_object], with no content-stream callback and [pcs] false. *)
let json_of_op pdf no_stream_data op =
  (* [operation operands name] is the array [operands...; name]. *)
  let operation operands name = `List (operands @ [`String name]) in
  let bare name = operation [] name in
  let nums fs name = operation (map mkfloat fs) name in
  let int_op i name = operation [mkint i] name in
  let named s name = operation [`String s] name in
  let embedded obj = json_of_object pdf (fun _ -> ()) no_stream_data false obj in
  let matrix t name =
    nums
      [t.Pdftransform.a; t.Pdftransform.b; t.Pdftransform.c;
       t.Pdftransform.d; t.Pdftransform.e; t.Pdftransform.f]
      name
  in
  match op with
  | O.Op_S -> bare "S"
  | O.Op_s -> bare "s"
  | O.Op_f -> bare "f"
  | O.Op_F -> bare "F"
  | O.Op_f' -> bare "f*"
  | O.Op_B -> bare "B"
  | O.Op_B' -> bare "B*"
  | O.Op_b -> bare "b"
  | O.Op_b' -> bare "b*"
  | O.Op_n -> bare "n"
  | O.Op_W -> bare "W"
  | O.Op_W' -> bare "W*"
  | O.Op_BT -> bare "BT"
  | O.Op_ET -> bare "ET"
  | O.Op_q -> bare "q"
  | O.Op_Q -> bare "Q"
  | O.Op_h -> bare "h"
  | O.Op_T' -> bare "T*"
  | O.Op_EMC -> bare "EMC"
  | O.Op_BX -> bare "BX"
  | O.Op_EX -> bare "EX"
  | O.Op_re (a, b, c, d) -> nums [a; b; c; d] "re"
  | O.Op_k (c, m, y, k) -> nums [c; m; y; k] "k"
  | O.Op_m (a, b) -> nums [a; b] "m"
  | O.Op_l (a, b) -> nums [a; b] "l"
  | O.Op_BDC (s, obj) -> operation [`String s; embedded obj] "BDC"
  | O.Op_gs s -> named s "gs"
  | O.Op_Do s -> named s "Do"
  | O.Op_CS s -> named s "CS"
  | O.Op_SCN fs -> nums fs "SCN"
  | O.Op_j j -> int_op j "j"
  | O.Op_cm t -> matrix t "cm"
  | O.Op_d (fl, y) -> operation [`List (map mkfloat fl); mkfloat y] "d"
  | O.Op_w w -> nums [w] "w"
  | O.Op_J j -> int_op j "J"
  | O.Op_M m -> nums [m] "M"
  | O.Op_ri s -> named s "ri"
  | O.Op_i i -> int_op i "i"
  | O.Op_c (a, b, c, d, e, f) -> nums [a; b; c; d; e; f] "c"
  | O.Op_v (a, b, c, d) -> nums [a; b; c; d] "v"
  | O.Op_y (a, b, c, d) -> nums [a; b; c; d] "y"
  | O.Op_Tc c -> nums [c] "Tc"
  | O.Op_Tw w -> nums [w] "Tw"
  | O.Op_Tz z -> nums [z] "Tz"
  | O.Op_TL l -> nums [l] "TL"
  | O.Op_Tf (k, s) -> operation [`String k; mkfloat s] "Tf"
  | O.Op_Tr i -> int_op i "Tr"
  | O.Op_Ts k -> nums [k] "Ts"
  | O.Op_Td (k, k') -> nums [k; k'] "Td"
  | O.Op_TD (k, k') -> nums [k; k'] "TD"
  | O.Op_Tm t -> matrix t "Tm"
  | O.Op_Tj s -> named s "Tj"
  | O.Op_TJ pdfobject -> operation [embedded pdfobject] "TJ"
  | O.Op_' s -> named s "'"
  | O.Op_'' (k, k', s) -> operation [mkfloat k; mkfloat k'; `String s] "''"
  | O.Op_d0 (k, k') -> nums [k; k'] "d0"
  | O.Op_d1 (a, b, c, d, e, f) -> nums [a; b; c; d; e; f] "d1"
  | O.Op_cs s -> named s "cs"
  | O.Op_SC fs -> nums fs "SC"
  | O.Op_sc fs -> nums fs "sc"
  | O.Op_scn fs -> nums fs "scn"
  | O.Op_G k -> nums [k] "G"
  | O.Op_g k -> nums [k] "g"
  | O.Op_RG (r, g, b) -> nums [r; g; b] "RG"
  | O.Op_rg (r, g, b) -> nums [r; g; b] "rg"
  | O.Op_K (c, m, y, k) -> nums [c; m; y; k] "K"
  | O.Op_sh s -> named s "sh"
  | O.Op_MP s -> named s "MP"
  | O.Op_BMC s -> named s "BMC"
  | O.Op_Unknown s -> named s "Unknown"
  (* The "Name" variants put the name after the numbers, just before the
     operator. *)
  | O.Op_SCNName (s, fs) -> operation (map mkfloat fs @ [`String s]) "SCNName"
  | O.Op_scnName (s, fs) -> operation (map mkfloat fs @ [`String s]) "scnName"
  | O.InlineImage (dict, data) ->
      operation [embedded dict; `String (Pdfio.string_of_bytes data)] "InlineImage"
  | O.Op_DP (s, obj) -> operation [`String s; embedded obj] "DP"
2021-10-06 14:52:05 +02:00
2020-01-31 13:17:55 +01:00
(* parse_stream needs pdf and resources. These are for lexing of inline images,
 * looking up the colourspace. *)
2020-01-31 13:17:55 +01:00
(* Parse the content stream data [bs] with [O.parse_stream] and return its
   operations as a JSON array, one element per operation. *)
let parse_content_stream pdf resources bs =
  let json_ops = map (json_of_op pdf false) (O.parse_stream pdf resources [bs]) in
    `List json_ops
2020-01-30 14:10:03 +01:00
2021-10-14 18:42:00 +02:00
(* Make sure each page only has one page content stream. Otherwise,
if not split on op boundaries, each one would fail to parse on its own. *)
2021-10-14 21:28:43 +02:00
(* Return [pdf] with its pages changed so that any page with two or more
   content streams has them parsed together and re-emitted as a single stream.
   Pages with zero or one content stream are left as they are. *)
let precombine_page_content pdf =
  let single_stream page =
    match page.Pdfpage.content with
    | [] | [_] -> page
    | streams ->
        let ops = Pdfops.parse_operators pdf page.Pdfpage.resources streams in
          {page with Pdfpage.content = [Pdfops.stream_of_ops ops]}
  in
    Pdfpage.change_pages true pdf (map single_stream (Pdfpage.pages_of_pagetree pdf))
2021-10-29 19:17:18 +02:00
(* PDF strings (except /ID in the trailer dictionary) are either PDFDocEncoding
or UTF16BE. Many times the UTF16BE can all be represented in PDFDocEncoding.
In this case, there are just lots of \000 bytes getting in the way making the
JSON hard to edit. So we preprocess such simple UTF16BE strings into
PDFDocEncoding. *)
(* Strings which [Pdftext.is_unicode] accepts are round-tripped through UTF8
   and back to a PDF document string; all other strings are returned
   untouched. *)
let preprocess_string s =
  match Pdftext.is_unicode s with
  | true -> Pdftext.pdfdocstring_of_utf8 (Pdftext.utf8_of_pdfdocstring s)
  | false -> s
(* Apply [preprocess_string] to every string inside a PDF object, descending
   through arrays, dictionaries and stream dictionaries. Stream data and all
   other kinds of object are returned unchanged. *)
let rec ppstring_single_object pdf obj =
  let recurse = ppstring_single_object pdf in
  match obj with
  | Pdf.String s -> Pdf.String (preprocess_string s)
  | Pdf.Array a -> Pdf.recurse_array recurse a
  | Pdf.Dictionary d -> Pdf.recurse_dict recurse d
  | Pdf.Stream {contents = (Pdf.Dictionary dict, data)} ->
      Pdf.Stream {contents = (Pdf.recurse_dict recurse dict, data)}
  | other -> other
(* Run [ppstring_single_object] over every object of [pdf] and over its
   trailer dictionary, updating [pdf] in place. *)
let preprocess_strings pdf =
  let fix = ppstring_single_object pdf in
    Pdf.objselfmap fix pdf;
    pdf.Pdf.trailerdict <- fix pdf.Pdf.trailerdict
2021-10-12 16:35:08 +02:00
(* Convert a whole PDF to CPDF JSON: an array of [objnum, object] pairs,
   starting with the parameters (object -1) and the trailer dictionary
   (object 0), followed by the PDF's objects sorted by object number.

   [parse_content]: combine each page's content into one stream and write
   content streams as lists of parsed operations rather than raw data.
   [no_stream_data]: elide stream data (recorded in the parameters).
   [decompress_streams]: decode every stream as far as possible first.

   The PDF is modified in place: strings are preprocessed, streams may be
   decoded, and unreferenced objects are removed. *)
let json_of_pdf
  ~parse_content ~no_stream_data ~decompress_streams
  pdf
=
  preprocess_strings pdf;
  let pdf = if parse_content then precombine_page_content pdf else pdf in
  if decompress_streams then
    Pdf.objiter (fun _ obj -> Pdfcodec.decode_pdfstream_until_unknown pdf obj) pdf;
  Pdf.remove_unreferenced pdf;
  let trailerdict = (0, json_of_object pdf (fun _ -> ()) no_stream_data false pdf.P.trailerdict) in
  let parameters =
    (-1, json_of_object pdf (fun _ -> ()) false false
      (Pdf.Dictionary [("/CPDFJSONformatversion", Pdf.Integer 2);
                       ("/CPDFJSONcontentparsed", Pdf.Boolean parse_content);
                       ("/CPDFJSONstreamdataincluded", Pdf.Boolean (not no_stream_data));
                       ("/CPDFJSONmajorpdfversion", Pdf.Integer pdf.Pdf.major);
                       ("/CPDFJSONminorpdfversion", Pdf.Integer pdf.Pdf.minor);
                      ]))
  in
  (* Set of object numbers found to be content streams. A hash table keeps the
     membership test below constant-time instead of scanning a list for every
     object in the file. *)
  let content_streams = Hashtbl.create 16 in
  let fcs n =
    Hashtbl.replace content_streams n ();
    if parse_content then Pdfcodec.decode_pdfstream_until_unknown pdf (P.lookup_obj pdf n)
  in
  let pairs =
    let ps = ref [] in
      P.objiter
        (fun i pdfobj ->
           ps := (i, json_of_object pdf fcs no_stream_data parse_content pdfobj)::!ps)
        pdf;
      parameters::trailerdict::sort compare !ps
  in
  let pairs_parsed =
    if not parse_content then pairs else
      map
        (fun (objnum, obj) ->
           if Hashtbl.mem content_streams objnum then
             begin match obj with
             | `Assoc ["S", `List [dict; `String _]] ->
                 (* Re-read the data from the PDF: the stream may have been
                    decoded by [fcs] after its JSON was produced. *)
                 let streamdata =
                   match P.lookup_obj pdf objnum with
                   | P.Stream {contents = (_, P.Got b)} -> b
                   | _ -> error "JSON: stream not decoded"
                 in
                 (* /Filter and /Length no longer describe the parsed form. *)
                 let dict =
                   match dict with
                   | `Assoc d ->
                       `Assoc (option_map (function (("/Filter" | "/Length"), _) -> None | (a, b) -> Some (a, b)) d)
                   | _ -> assert false
                 in
                   (objnum,
                    `Assoc ["S", `List [dict; parse_content_stream pdf (P.Dictionary []) streamdata]])
             | _ -> error "json_of_pdf: stream parsing inconsistency"
             end
           else
             (objnum, obj))
        pairs
  in
    `List
      (map
        (fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj])
        pairs_parsed)
2020-01-30 11:42:24 +01:00
2021-10-14 21:28:43 +02:00
(* Write [pdf] as pretty-printed CPDF JSON to the output [o]: directly to its
   OCaml channel when it has one, otherwise as a single string through its
   [output_string] function. *)
let to_output o ~parse_content ~no_stream_data ~decompress_streams pdf =
  let json = json_of_pdf ~parse_content ~no_stream_data ~decompress_streams pdf in
    match o.Pdfio.out_caml_channel with
    | None -> o.Pdfio.output_string (J.pretty_to_string json)
    | Some ch -> J.pretty_to_channel ch json
2021-10-01 13:16:55 +02:00
2021-10-01 13:53:21 +02:00
(* Read a CPDF JSON file from the input [i] and build the PDF it describes.
   The JSON is read from the input's OCaml channel when it has one, otherwise
   from the whole input loaded as a string. Any exception raised while reading
   or converting is reported through [error] using its printed form. *)
let of_input i =
  let read_json () =
    match i.Pdfio.caml_channel with
    | Some ch -> J.from_channel ch
    | None ->
        let content = Pdfio.string_of_bytes (Pdfio.bytes_of_input i 0 i.Pdfio.in_channel_length) in
          J.from_string content
  in
    try pdf_of_json (read_json ()) with
      e -> error (Printexc.to_string e)