more
This commit is contained in:
parent
ee8a10aae4
commit
f071f1805a
12
cpdfjson.ml
12
cpdfjson.ml
|
@ -453,22 +453,12 @@ let precombine_page_content pdf =
|
||||||
in
|
in
|
||||||
Pdfpage.change_pages true pdf pages'
|
Pdfpage.change_pages true pdf pages'
|
||||||
|
|
||||||
(* PDF strings (except /ID in the trailer dictionary) are either PDFDocEncoding
|
|
||||||
or UTF16BE. Many times the UTF16BE can all be represented in PDFDocEncoding.
|
|
||||||
In this case, there are just lots of \000 bytes getting in the way making the
|
|
||||||
JSON hard to edit. So we preprocess such simple UTF16BE strings into
|
|
||||||
PDFDocEncoding. *)
|
|
||||||
let preprocess_string s =
|
|
||||||
if Pdftext.is_unicode s
|
|
||||||
then Pdftext.pdfdocstring_of_utf8 (Pdftext.utf8_of_pdfdocstring s)
|
|
||||||
else s
|
|
||||||
|
|
||||||
let rec ppstring_single_object pdf = function
|
let rec ppstring_single_object pdf = function
|
||||||
| Pdf.Dictionary d -> Pdf.recurse_dict (ppstring_single_object pdf) d
|
| Pdf.Dictionary d -> Pdf.recurse_dict (ppstring_single_object pdf) d
|
||||||
| (Pdf.Stream {contents = (Pdf.Dictionary dict, data)}) ->
|
| (Pdf.Stream {contents = (Pdf.Dictionary dict, data)}) ->
|
||||||
Pdf.Stream {contents = (Pdf.recurse_dict (ppstring_single_object pdf) dict, data)}
|
Pdf.Stream {contents = (Pdf.recurse_dict (ppstring_single_object pdf) dict, data)}
|
||||||
| Pdf.Array a -> Pdf.recurse_array (ppstring_single_object pdf) a
|
| Pdf.Array a -> Pdf.recurse_array (ppstring_single_object pdf) a
|
||||||
| Pdf.String s -> Pdf.String (preprocess_string s)
|
| Pdf.String s -> Pdf.String (Pdftext.simplify_utf16be s)
|
||||||
| x -> x
|
| x -> x
|
||||||
|
|
||||||
let preprocess_strings pdf =
|
let preprocess_strings pdf =
|
||||||
|
|
Loading…
Reference in New Issue