From ee8a10aae415e2491b88ba0883be23cd2cfef631 Mon Sep 17 00:00:00 2001
From: John Whitington <john@coherentgraphics.co.uk>
Date: Fri, 29 Oct 2021 18:17:18 +0100
Subject: [PATCH] Preprocess UTF16BE strings

---
 cpdfjson.ml | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/cpdfjson.ml b/cpdfjson.ml
index 141d4e1..922f66d 100644
--- a/cpdfjson.ml
+++ b/cpdfjson.ml
@@ -6,12 +6,12 @@ object, one for each object in the file and two special ones:
 Object -1: CPDF's own data with the PDF version number, CPDF JSON format
 number, and flags used when writing (which may be required when reading):
 
-  o /CPDFJSONformatversion (integer, currently 2)
+  o /CPDFJSONformatversion (CPDFJSON integer (see below), currently 2)
   o /CPDFJSONcontentparsed (boolean, true if content streams have been parsed)
   o /CPDFJSONstreamdataincluded (boolean, true if stream data included. Cannot
   round-trip if false).
-  o /CPDFJSONmajorpdfversion (integer)
-  o /CPDFJSONminorpdfversion (integer)
+  o /CPDFJSONmajorpdfversion (CPDFJSON integer)
+  o /CPDFJSONminorpdfversion (CPDFJSON integer)
 
 Object 0: The PDF's trailer dictionary
 
@@ -24,6 +24,11 @@ Objects 1..n: The PDF's objects.
   o Indirect references are integers
   o Streams are {"S": [dict, data]}
 
+  o Strings are converted from PDFDocEncoding to UTF8 before being encoded in
+  JSON. When they are read back, the process is reversed: JSON encoded --> UTF8
+  --> PDFDocEncoding. This makes strings easier to edit. The conversion is not
+  applied to strings within text operators in parsed content streams.
+
 There are two subformats: parsing content streams or not.  Hello World in CPDF
 JSON without parsing content streams:
 
@@ -448,10 +453,33 @@ let precombine_page_content pdf =
   in
     Pdfpage.change_pages true pdf pages'
 
+(* PDF strings (except /ID in the trailer dictionary) are either PDFDocEncoding
+or UTF16BE. Often a UTF16BE string contains only characters which can also be
+represented in PDFDocEncoding. In that case the \000 bytes just get in the way
+and make the JSON hard to edit, so we preprocess such simple UTF16BE strings
+into PDFDocEncoding. Other strings (not Pdftext.is_unicode) pass through. *)
+let preprocess_string s =
+  if not (Pdftext.is_unicode s) then s else
+    let utf8 = Pdftext.utf8_of_pdfdocstring s in
+      Pdftext.pdfdocstring_of_utf8 utf8
+
+let rec ppstring_single_object pdf = function (* rewrites nested strings *)
+  | Pdf.String str -> Pdf.String (preprocess_string str)
+  | Pdf.Array items -> Pdf.recurse_array (ppstring_single_object pdf) items
+  | Pdf.Dictionary kvs -> Pdf.recurse_dict (ppstring_single_object pdf) kvs
+  | Pdf.Stream {contents = (Pdf.Dictionary kvs, payload)} ->
+      Pdf.Stream {contents = (ppstring_single_object pdf (Pdf.Dictionary kvs), payload)}
+  | other -> other (* anything else is returned unchanged *)
+
+(* Object strings only. The trailer is skipped so /ID is never re-encoded. *)
+let preprocess_strings pdf =
+  Pdf.objselfmap (ppstring_single_object pdf) pdf
+
 let json_of_pdf
   ~parse_content ~no_stream_data ~decompress_streams
   pdf
 =
+  preprocess_strings pdf;
   let pdf = if parse_content then precombine_page_content pdf else pdf in
   if decompress_streams then
     Pdf.objiter (fun _ obj -> Pdfcodec.decode_pdfstream_until_unknown pdf obj) pdf;