remove slashes from print_struct_tree

This commit is contained in:
John Whitington 2024-09-24 15:45:32 +01:00
parent d8c91180bc
commit 6495daba8c
3 changed files with 27 additions and 24 deletions

Binary file not shown.

View File

@ -5201,8 +5201,8 @@ To enable the generation of structure information, we may add \texttt{-draw-stru
\begin{verbatim} \begin{verbatim}
$cpdf -print-struct-tree out.pdf $cpdf -print-struct-tree out.pdf
/StructTreeRoot StructTreeRoot
└── /P (1)\end{verbatim} └── P (1)\end{verbatim}
\noindent To prevent such automatic tagging, relying only on manual tags, use \texttt{-no-auto-tags}. The effect may be reversed at any point with \texttt{-auto-tags}. Unless told otherwise, Cpdf auto-tags text as paragraphs /P, and images as /Figure. \noindent To prevent such automatic tagging, relying only on manual tags, use \texttt{-no-auto-tags}. The effect may be reversed at any point with \texttt{-auto-tags}. Unless told otherwise, Cpdf auto-tags text as paragraphs /P, and images as /Figure.
@ -5226,10 +5226,10 @@ There are two types of tag we can add manually. One kind is used to tag individu
\noindent And here is the structure tree: \noindent And here is the structure tree:
\begin{verbatim} \begin{verbatim}
/StructTreeRoot StructTreeRoot
├── /H1 (1) ├── H1 (1)
├── /P (1) ├── P (1)
└── /P (1) └── P (1)
\end{verbatim} \end{verbatim}
\noindent Content tagging is flat - every part of the content of a page is part of only one \texttt{-tag}. The logical structure of a document, however, is a tree structure -- sections contain paragraphs, and so on. To build the logical structure tree, we add structure tags using \texttt{-stag} / \texttt{-end-stag} pairs which, of course, may be nested. For example, let's put our H1, and P sections in a Section structure tag: \noindent Content tagging is flat - every part of the content of a page is part of only one \texttt{-tag}. The logical structure of a document, however, is a tree structure -- sections contain paragraphs, and so on. To build the logical structure tree, we add structure tags using \texttt{-stag} / \texttt{-end-stag} pairs which, of course, may be nested. For example, let's put our H1, and P sections in a Section structure tag:
@ -5246,11 +5246,11 @@ There are two types of tag we can add manually. One kind is used to tag individu
\noindent Here is the structure tree: \noindent Here is the structure tree:
\begin{verbatim} \begin{verbatim}
/StructTreeRoot StructTreeRoot
└── /Section (1) └── Section (1)
├── /H1 (1) ├── H1 (1)
├── /P (1) ├── P (1)
└── /P (1) └── P (1)
\end{verbatim} \end{verbatim}
\noindent Some PDF standards require that everything not marked as content (e.g. paragraph, figure, etc.) is marked as an artifact. For example, a background image which is the same on every page, or a page border. This tells PDF processors that it is not logical content. \noindent Some PDF standards require that everything not marked as content (e.g. paragraph, figure, etc.) is marked as an artifact. For example, a background image which is the same on every page, or a page border. This tells PDF processors that it is not logical content.
@ -5356,18 +5356,18 @@ We can print an abbreviated form of the structure tree to standard output:
\begin{minipage}{\linewidth} \begin{minipage}{\linewidth}
\begin{framed} \begin{framed}
\begin{verbatim} \begin{verbatim}
/StructTreeRoot StructTreeRoot
└── /Document └── Document
├── /Sect ├── Sect
│ ├── /P (1) │ ├── P (1)
│ │ ├── /Span (1) │ │ ├── Span (1)
│ └── /Figure (1) │ └── Figure (1)
├── /Sect ├── Sect
│ ├── /H1 (2) │ ├── H1 (2)
│ └── /TOC │ └── TOC
│ ├── /TOCI │ ├── TOCI
│ │ └── /P │ │ └── P
│ │ └── /Link (2) │ │ └── Link (2)
. . . .
. . . .
. . . .

View File

@ -1584,6 +1584,9 @@ let rec remove_empty = function
let cs' = map remove_empty cs in let cs' = map remove_empty cs in
E2 (n, attrs, lose (function E2 ("", _, []) -> true | _ -> false) cs') E2 (n, attrs, lose (function E2 ("", _, []) -> true | _ -> false) cs')
(* Strip the leading '/' from the name of every node in a structure tree, so
   that names are printed as "P" rather than "/P". Only a genuine leading
   slash is removed: the empty name, and any name which does not begin with
   '/', are returned unchanged rather than losing their first character.
   Attributes are kept as they are, and children are processed recursively. *)
let rec remove_slashes = function
  E2 (n, attrs, cs) ->
    let n' = match explode n with '/'::rest -> implode rest | _ -> n in
      E2 (n', attrs, map remove_slashes cs)
let print_struct_tree pdf = let print_struct_tree pdf =
let page_lookup = let page_lookup =
hashtable_of_dictionary (combine (Pdf.page_reference_numbers pdf) (ilist 1 (Pdfpage.endpage pdf))) hashtable_of_dictionary (combine (Pdf.page_reference_numbers pdf) (ilist 1 (Pdfpage.endpage pdf)))
@ -1599,7 +1602,7 @@ let print_struct_tree pdf =
(Cpdfprinttree.to_string (Cpdfprinttree.to_string
~get_name:(fun (E2 (x, a, _)) -> if int_of_string (get_page a) > 0 then x ^ " (" ^ get_page a ^ ")" else x) ~get_name:(fun (E2 (x, a, _)) -> if int_of_string (get_page a) > 0 then x ^ " (" ^ get_page a ^ ")" else x)
~get_children:(fun (E2 (_, _, cs)) -> cs) ~get_children:(fun (E2 (_, _, cs)) -> cs)
(remove_empty st)) (remove_empty (remove_slashes st)))
let cpdfua_args title = let cpdfua_args title =
[ "-create-pdf"; [ "-create-pdf";