66 lines
2.1 KiB
OCaml
66 lines
2.1 KiB
OCaml
open Pdfutil
|
|
|
|
(** [removetext range pdf] removes, from the pages handled by
    [Cpdfpage.process_pages] for [range], every marked-content section that
    is opened by a BMC operator tagged /CPDFSTAMP (presumably the sections
    cpdf itself writes when stamping text; confirm against the stamping
    code). Each processed page ends up with a single content stream rebuilt
    from the operators that remain. *)
let removetext range pdf =
  (* Drop operators up to and including the EMC that closes the stamp section
     we are currently inside, returning whatever follows it.
     [level] is the number of marked-content sections currently open; the
     caller passes 1 for the stamp's own BMC. Every BMC or BDC opens a section
     and every EMC closes one, so marked content nested inside the stamp
     (whatever its tag) no longer ends the removal at the wrong EMC.
     If the operators run out before the closing EMC, nothing is returned. *)
  let rec remove_until_last_EMC level = function
    | [] -> []
    | (Pdfops.Op_BMC _ | Pdfops.Op_BDC _)::more ->
        remove_until_last_EMC (level + 1) more
    | Pdfops.Op_EMC::more ->
        if level = 1
          then more
          else remove_until_last_EMC (level - 1) more
    | _::more ->
        remove_until_last_EMC level more
  in
  (* Walk the operator list, copying everything outside stamp sections.
     [prev] accumulates the kept operators in reverse order. *)
  let rec remove_stamps prev = function
    | [] -> rev prev
    | Pdfops.Op_BMC "/CPDFSTAMP"::more ->
        let rest = remove_until_last_EMC 1 more in
          remove_stamps prev rest
    | h::t -> remove_stamps (h::prev) t
  in
  (* Per-page worker: the first argument supplied through [Cpdfutil.ppstub]
     is not needed here. *)
  let removetext_page _ page =
    let ops =
      Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
    in
      {page with
         Pdfpage.content = [Pdfops.stream_of_ops (remove_stamps [] ops)]}
  in
    Cpdfpage.process_pages (Cpdfutil.ppstub removetext_page) pdf range
|
|
|
|
(** [remove_all_text_ops pdf resources content] parses the content streams
    [content] using [resources], discards every text-showing operator and
    returns the remaining operators, in their original order, as a
    one-element list holding a newly built stream.

    The function does not call itself, so it is a plain [let] rather than a
    [let rec]. *)
let remove_all_text_ops pdf resources content =
  (* The Tj, TJ, single-quote and double-quote operators are the ones
     treated as text operators; every other operator is kept. *)
  let is_textop = function
    | Pdfops.Op_Tj _ | Pdfops.Op_' _ | Pdfops.Op_'' _ | Pdfops.Op_TJ _ -> true
    | _ -> false
  in
  let ops = Pdfops.parse_operators pdf resources content in
  let kept = List.filter (fun op -> not (is_textop op)) ops in
    [Pdfops.stream_of_ops kept]
|
|
|
|
(** [remove_all_text_page pdf p] runs [remove_all_text_ops] over page [p].

    [Cpdfutil.process_xobjects] is first called with the same function
    (presumably to apply it to the xobjects used by the page; confirm against
    [Cpdfutil]), then the page's own content is rewritten. Returns the updated
    page paired with [pdf], which is passed back unchanged as a value. *)
let remove_all_text_page pdf p =
  (* Read the page's own inputs before the xobjects are processed. *)
  let page_resources, page_content = p.Pdfpage.resources, p.Pdfpage.content in
  Cpdfutil.process_xobjects pdf p remove_all_text_ops;
  let stripped = remove_all_text_ops pdf page_resources page_content in
    ({p with Pdfpage.content = stripped}, pdf)
|
|
|
|
(** [remove_all_text range pdf] applies [remove_all_text_page] to every page
    of [pdf] whose number (as produced by [indx] over the page list) is a
    member of [range]; other pages are passed through untouched. The
    resulting page list is installed with [Pdfpage.change_pages true]. *)
let remove_all_text range pdf =
  let pages = Pdfpage.pages_of_pagetree pdf in
  (* Thread the document through the pages in order, collecting the output
     pages in reverse. *)
  let step (acc, current) page pagenum =
    if mem pagenum range then
      let page', current' = remove_all_text_page current page in
        (page'::acc, current')
    else
      (page::acc, current)
  in
  let rev_pages, final_pdf =
    List.fold_left2 step ([], pdf) pages (indx pages)
  in
    Pdfpage.change_pages true final_pdf (rev rev_pages)
|