more

2025-06-05 22:09:39 +02:00 · 2021-12-27 19:21:59 +00:00
parent 974025f7ac
commit 83d484a15c
3 changed files with 87 additions and 66 deletions
--- a/cpdfbookmarks.ml
+++ b/cpdfbookmarks.ml
@@ -192,75 +192,75 @@ let output_json_marks ch calculate_page_number pdf fastrefnums marks =
  let json = `List (map json_of_mark marks) in
    J.pretty_to_channel ch json
 (* [process_string encoding s] renders the bookmark text [s] for the textual
    bookmark listing, according to [encoding]:
    - [Cpdfmetadata.Raw]: [s] is returned untouched; it is neither decoded
      nor escaped.
    - [Cpdfmetadata.UTF8]: [s] is decoded to codepoints, escaped, and
      re-encoded as UTF8.
    - [Cpdfmetadata.Stripped]: [s] is decoded and escaped, then every
      codepoint above 127 is dropped.
    Escaping turns a backslash into two backslashes, a newline into a
    backslash followed by the letter n, and a double quote into a backslash
    followed by a double quote. *)
 let process_string encoding s =
   let bs = int_of_char '\\'
   and nl = int_of_char '\n'
   and n = int_of_char 'n'
   and q = int_of_char '\"' in
   (* Escape in one tail-recursive pass. The accumulator is built in reverse,
      so each two-codepoint replacement is pushed backslash first. A single
      pass gives the same result as escaping backslashes, then newlines, then
      quotes in turn, because no replacement introduces a codepoint that a
      later step would rewrite. *)
   let escape codepoints =
     List.rev
       (List.fold_left
          (fun acc c ->
             if c = bs then bs :: bs :: acc
             else if c = nl then n :: bs :: acc
             else if c = q then q :: bs :: acc
             else c :: acc)
          [] codepoints)
   in
   (* Decoding is deferred behind a thunk so that Raw mode does no work on the
      string at all. *)
   let escaped () = escape (Pdftext.codepoints_of_pdfdocstring s) in
   (* Keep only the codepoints which fit in 7 bits. *)
   let strip codepoints =
     let b = Buffer.create 200 in
       List.iter
         (fun x -> if x <= 127 then Buffer.add_char b (char_of_int x))
         codepoints;
       Buffer.contents b
   in
     match encoding with
     | Cpdfmetadata.Raw -> s
     | Cpdfmetadata.UTF8 -> Pdftext.utf8_of_codepoints (escaped ())
     | Cpdfmetadata.Stripped -> strip (escaped ())
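 (* List the bookmarks whose targets lie in the given page range. Bookmarks
    with a null destination or a named destination in another document are
    always kept. When [json] is true the result is written as JSON to stdout;
    otherwise one line per bookmark is written to the given output. *)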
 let list_bookmarks ~json encoding range pdf output =
-  let process_stripped escaped =
+  let bookmarks = Pdfmarks.read_bookmarks pdf in
-    let b = Buffer.create 200 in
+  let refnums = Pdf.page_reference_numbers pdf in
-      iter
+  let rangetable = hashset_of_list range in
-        (fun x ->
+  let range_is_all = range = ilist 1 (Pdfpage.endpage pdf) in
-           if x <= 127 then Buffer.add_char b (char_of_int x))
+  let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
-        escaped;
+    (* Find the pagenumber of each bookmark target. If it is in the range,
-      Buffer.contents b
+     * keep that bookmark. Also keep the bookmark if its target is the null
-  in
+     * destination. *)
-  let process_string s =
+    let inrange =
-    let rec replace c x y = function
+      keep
-    | [] -> []
+        (function x ->
-    | h::t when h = c -> x::y::replace c x y t
+           range_is_all || 
-    | h::t -> h::replace c x y t
+           x.Pdfmarks.target = Pdfdest.NullDestination ||
           (match x.Pdfmarks.target with Pdfdest.NamedDestinationElsewhere _ -> true | _ -> false) ||
           Hashtbl.mem rangetable (Pdfpage.pagenumber_of_target ~fastrefnums pdf x.Pdfmarks.target)) bookmarks
    in
-      (* Convert to UTF8, raw, or stripped, and escape backslashed and quotation marks *)
+      let calculate_page_number mark =
-      let codepoints = Pdftext.codepoints_of_pdfdocstring s in
+        (* Some buggy PDFs use integers for page numbers instead of page
-        let escaped =
+         * object references. Adobe Reader and Preview seem to support
-          let bs = int_of_char '\\'
+         * this, for presumably historical reasons. So if we see a
-          and nl = int_of_char '\n'
+         * OtherDocPageNumber (which is what Pdfdest parses these as,
-          and n = int_of_char 'n'
+         * because that's what they are legitimately), we use this as the
-          and q = int_of_char '\"' in
+         * page number. It is zero based, though, and we are one-based, so
-            replace q bs q (replace nl bs n (replace bs bs bs codepoints))
+         * we add one. Pdfpage.pagenumber_of_target has been modified to support this.*)
-        in
+        Pdfpage.pagenumber_of_target ~fastrefnums pdf mark.Pdfmarks.target
-          match encoding with
+      in
-          | Cpdfmetadata.UTF8 -> Pdftext.utf8_of_codepoints escaped
+        if json then
-          | Cpdfmetadata.Stripped -> process_stripped escaped
+          output_json_marks stdout calculate_page_number pdf fastrefnums inrange
-          | Cpdfmetadata.Raw -> s
+        else
-    in
+          iter
-      let bookmarks = Pdfmarks.read_bookmarks pdf in
+            (function mark ->
-      let refnums = Pdf.page_reference_numbers pdf in
+               output.Pdfio.output_string
-      let rangetable = hashset_of_list range in
+                 (Printf.sprintf "%i \"%s\" %i%s %s\n"
-      let range_is_all = range = ilist 1 (Pdfpage.endpage pdf) in
+                   mark.Pdfmarks.level
-      let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
+                   (process_string encoding mark.Pdfmarks.text)
-        (* Find the pagenumber of each bookmark target. If it is in the range,
+                   (calculate_page_number mark)
-         * keep that bookmark. Also keep the bookmark if its target is the null
+                   (if mark.Pdfmarks.isopen then " open" else "")
-         * destination. *)
+                   (output_string_of_target pdf fastrefnums mark.Pdfmarks.target)))
-        let inrange =
+            inrange
          keep
            (function x ->
               range_is_all || 
               x.Pdfmarks.target = Pdfdest.NullDestination ||
               (match x.Pdfmarks.target with Pdfdest.NamedDestinationElsewhere _ -> true | _ -> false) ||
               Hashtbl.mem rangetable (Pdfpage.pagenumber_of_target ~fastrefnums pdf x.Pdfmarks.target)) bookmarks
        in
          let calculate_page_number mark =
            (* Some buggy PDFs use integers for page numbers instead of page
             * object references. Adobe Reader and Preview seem to support
             * this, for presumably historical reasons. So if we see a
             * OtherDocPageNumber (which is what Pdfdest parses these as,
             * because that's what they are legitimately), we use this as the
             * page number. It is zero based, though, and we are one-based, so
             * we add one. Pdfpage.pagenumber_of_target has been modified to support this.*)
            Pdfpage.pagenumber_of_target ~fastrefnums pdf mark.Pdfmarks.target
          in
            if json then
              output_json_marks stdout calculate_page_number pdf fastrefnums inrange
            else
              iter
                (function mark ->
                   output.Pdfio.output_string
                     (Printf.sprintf "%i \"%s\" %i%s %s\n"
                       mark.Pdfmarks.level
                       (process_string mark.Pdfmarks.text)
                       (calculate_page_number mark)
                       (if mark.Pdfmarks.isopen then " open" else "")
                       (output_string_of_target pdf fastrefnums mark.Pdfmarks.target)))
                inrange
 (* o is the stamp, u is the main pdf page *)
--- a/cpdfmanual.pdf
+++ b/cpdfmanual.pdf
--- a/cpdfmanual.tex
+++ b/cpdfmanual.tex
@@ -1562,6 +1562,9 @@ There are two options which turn off parts of the squeezer. They are \texttt{-sq
  \begin{framed}
  \small\noindent\verb!cpdf -list-bookmarks [-utf8 | -raw] in.pdf!
  \vspace{1.5mm}
  \small\noindent\verb!cpdf -list-bookmarks-json in.pdf!
  \vspace{1.5mm}
  \small\noindent\verb!cpdf -remove-bookmarks in.pdf -o out.pdf!
@@ -1608,6 +1611,24 @@ the ASCII range. To prevent this, and return unicode UTF8 output, add the
 \texttt{-utf8} option to the command. To prevent any processing, use the
 \texttt{-raw} option. See Section \ref{textencodings} for more information. A newline in a bookmark is represented as \texttt{"\textbackslash n"}.
 By using \texttt{-list-bookmarks-json} instead, the bookmarks are output, in order, as a JSON array of dictionaries, each formatted thus:
 \begin{verbatim}
 { "level": 0,
  "text": "1 Basic Usage",
  "page": 17,
  "open": false,
  "target":
    [ { "I": 17 },
      { "N": "/XYZ" },
      { "F": 85.039 },
      { "F": 609.307 },
      null ]
 }
 \end{verbatim}
 See chapter 15 for more details of cpdf's JSON formatting. Bookmark text in JSON bookmarks, however, is in UTF8 for ease of use.
 \subsection{Destinations}
 The destination is an extended description of where the bookmark should point to (i.e.\ it can be more detailed than just giving the page). For example, it may point to a section heading halfway down a page. Here are the possibilities: