From 69c988c756fede127e9bba7689dd194b6e40e534 Mon Sep 17 00:00:00 2001 From: Jakub Melka Date: Thu, 9 Sep 2021 18:28:55 +0200 Subject: [PATCH] DocDiff application: LCS algorithm --- Pdf4QtLib/Pdf4QtLib.pro | 1 + Pdf4QtLib/sources/pdfalgorithmlcs.h | 126 ++++++++++++++++++++++++++++ Pdf4QtLib/sources/pdfdiff.cpp | 71 ++++++++++++++-- Pdf4QtLib/sources/pdfdiff.h | 12 ++- Pdf4QtLib/sources/pdfpainter.cpp | 102 ++++++++++++++++++---- Pdf4QtLib/sources/pdfpainter.h | 17 +++- 6 files changed, 303 insertions(+), 26 deletions(-) create mode 100644 Pdf4QtLib/sources/pdfalgorithmlcs.h diff --git a/Pdf4QtLib/Pdf4QtLib.pro b/Pdf4QtLib/Pdf4QtLib.pro index 7dffe03..72743f3 100644 --- a/Pdf4QtLib/Pdf4QtLib.pro +++ b/Pdf4QtLib/Pdf4QtLib.pro @@ -110,6 +110,7 @@ SOURCES += \ HEADERS += \ sources/pdfaction.h \ sources/pdfadvancedtools.h \ + sources/pdfalgorithmlcs.h \ sources/pdfannotation.h \ sources/pdfblendfunction.h \ sources/pdfccittfaxdecoder.h \ diff --git a/Pdf4QtLib/sources/pdfalgorithmlcs.h b/Pdf4QtLib/sources/pdfalgorithmlcs.h new file mode 100644 index 0000000..90fa190 --- /dev/null +++ b/Pdf4QtLib/sources/pdfalgorithmlcs.h @@ -0,0 +1,126 @@ +// Copyright (C) 2021 Jakub Melka +// +// This file is part of PDF4QT. +// +// PDF4QT is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// with the written consent of the copyright owner, any later version. +// +// PDF4QT is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with PDF4QT. If not, see <https://www.gnu.org/licenses/>. 
+ +#ifndef PDFALGORITHMLCS_H +#define PDFALGORITHMLCS_H + +#include "pdfglobal.h" + +namespace pdf +{ + +/// Algorithm for computing longest common subsequence, on two sequences +/// of objects, which are compared for equality by the supplied comparator. +/// Constructor takes bidirectional iterators to the sequence. So, iterators +/// are required to be bidirectional. +template +class PDFAlgorithmLongestCommonSubsequence +{ +public: + PDFAlgorithmLongestCommonSubsequence(Iterator it1, + Iterator it1End, + Iterator it2, + Iterator it2End, + Comparator comparator); + + void perform(); + +private: + Iterator m_it1; + Iterator m_it1End; + Iterator m_it2; + Iterator m_it2End; + + size_t m_size1; + size_t m_size2; + size_t m_matrixSize; + + Comparator m_comparator; + + std::vector m_backtrackData; +}; + +template +PDFAlgorithmLongestCommonSubsequence::PDFAlgorithmLongestCommonSubsequence(Iterator it1, + Iterator it1End, + Iterator it2, + Iterator it2End, + Comparator comparator) : + m_it1(std::move(it1)), + m_it1End(std::move(it1End)), + m_it2(std::move(it2)), + m_it2End(std::move(it2End)), + m_size1(0), + m_size2(0), + m_matrixSize(0), + m_comparator(std::move(comparator)) +{ + m_size1 = std::distance(m_it1, m_it1End) + 1; + m_size2 = std::distance(m_it2, m_it2End) + 1; + m_matrixSize = m_size1 * m_size2; +} + +template +void PDFAlgorithmLongestCommonSubsequence::perform() +{ + m_backtrackData.resize(m_matrixSize); + + std::vector rowTop(m_size1, size_t()); + std::vector rowBottom(m_size1, size_t()); + + // Jakub Melka: we will have columns consisting of it1...it1End + // and rows consisting of it2...it2End. We iterate through rows, + // and for each row, we update longest common subsequence data. 
+ + auto it2 = m_it2; + for (size_t i2 = 1; i2 < m_size2; ++i2, ++it2) + { + auto it1 = m_it1; + for (size_t i1 = 1; i1 < m_size1; ++i1, ++it1) + { + if (m_comparator(*it1, *it2)) + { + // We have a match: extend the diagonal (upper-left) value by one + rowBottom[i1] = rowTop[i1 - 1] + 1; + } + else + { + const size_t leftCellValue = rowBottom[i1 - 1]; + const size_t upperCellValue = rowTop[i1]; + bool isLeftBigger = leftCellValue > upperCellValue; + + if (isLeftBigger) + { + rowBottom[i1] = leftCellValue; + m_backtrackData[i2 * m_size1 + i1] = true; + } + else + { + rowBottom[i1] = upperCellValue; + m_backtrackData[i2 * m_size1 + i1] = false; + } + } + } + + // Bottom row will become top row for the next row of the matrix + std::swap(rowTop, rowBottom); + } +} + +} // namespace pdf + +#endif // PDFALGORITHMLCS_H diff --git a/Pdf4QtLib/sources/pdfdiff.cpp b/Pdf4QtLib/sources/pdfdiff.cpp index 98e9246..ad7266d 100644 --- a/Pdf4QtLib/sources/pdfdiff.cpp +++ b/Pdf4QtLib/sources/pdfdiff.cpp @@ -23,6 +23,7 @@ #include "pdfcms.h" #include "pdfcompiler.h" #include "pdfconstants.h" +#include "pdfalgorithmlcs.h" #include @@ -34,7 +35,7 @@ PDFDiff::PDFDiff(QObject* parent) : m_progress(nullptr), m_leftDocument(nullptr), m_rightDocument(nullptr), - m_options(Asynchronous), + m_options(Asynchronous | PC_Text | PC_VectorGraphics | PC_Images), m_epsilon(0.0001), m_cancelled(false) { @@ -173,6 +174,7 @@ void PDFDiff::stepProgress() struct PDFDiffPageContext { PDFInteger pageIndex = 0; + std::array pageHash = { }; PDFPrecompiledPage::GraphicPieceInfos graphicPieces; }; @@ -208,8 +210,11 @@ void PDFDiff::performSteps(const std::vector& leftPages, const std:: PDFRenderer renderer(m_leftDocument, &fontCache, cms.data(), &optionalContentActivity, features, pdf::PDFMeshQualitySettings()); renderer.compile(&compiledPage, context.pageIndex); - PDFReal epsilon = calculateEpsilonForPage(m_leftDocument->getCatalog()->getPage(context.pageIndex)); - context.graphicPieces = compiledPage.calculateGraphicPieceInfos(epsilon); + auto page = 
m_leftDocument->getCatalog()->getPage(context.pageIndex); + PDFReal epsilon = calculateEpsilonForPage(page); + context.graphicPieces = compiledPage.calculateGraphicPieceInfos(page->getMediaBox(), epsilon); + + finalizeGraphicsPieces(context); }; PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, leftPreparedPages.begin(), leftPreparedPages.end(), fillPageContext); stepProgress(); @@ -233,14 +238,33 @@ void PDFDiff::performSteps(const std::vector& leftPages, const std:: PDFRenderer renderer(m_rightDocument, &fontCache, cms.data(), &optionalContentActivity, features, pdf::PDFMeshQualitySettings()); renderer.compile(&compiledPage, context.pageIndex); - PDFReal epsilon = calculateEpsilonForPage(m_leftDocument->getCatalog()->getPage(context.pageIndex)); - context.graphicPieces = compiledPage.calculateGraphicPieceInfos(epsilon); + const PDFPage* page = m_rightDocument->getCatalog()->getPage(context.pageIndex); // Page index belongs to the right document, so it must be resolved there + PDFReal epsilon = calculateEpsilonForPage(page); + context.graphicPieces = compiledPage.calculateGraphicPieceInfos(page->getMediaBox(), epsilon); + + finalizeGraphicsPieces(context); }; PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, rightPreparedPages.begin(), rightPreparedPages.end(), fillPageContext); stepProgress(); } + // StepMatchPages + if (!m_cancelled) + { + // Match pages: run the LCS algorithm over the page hashes of both documents + auto comparePages = [](const PDFDiffPageContext& left, const PDFDiffPageContext& right) + { + return left.pageHash == right.pageHash; + }; + PDFAlgorithmLongestCommonSubsequence algorithm(leftPreparedPages.cbegin(), leftPreparedPages.cend(), + rightPreparedPages.cbegin(), rightPreparedPages.cend(), + comparePages); + algorithm.perform(); + + stepProgress(); + } + // StepExtractTextLeftDocument if (!m_cancelled) { @@ -266,6 +290,43 @@ void PDFDiff::performSteps(const std::vector& leftPages, const std:: } } +void PDFDiff::finalizeGraphicsPieces(PDFDiffPageContext& context) +{ + std::sort(context.graphicPieces.begin(), context.graphicPieces.end()); + + // Compute 
page hash using active settings + QCryptographicHash hasher(QCryptographicHash::Sha512); + hasher.reset(); + + for (const PDFPrecompiledPage::GraphicPieceInfo& info : context.graphicPieces) + { + if (info.isText() && !m_options.testFlag(PC_Text)) + { + continue; + } + if (info.isVectorGraphics() && !m_options.testFlag(PC_VectorGraphics)) + { + continue; + } + if (info.isImage() && !m_options.testFlag(PC_Images)) + { + continue; + } + if (info.isShading() && !m_options.testFlag(PC_Mesh)) + { + continue; + } + + hasher.addData(reinterpret_cast(info.hash.data()), int(info.hash.size())); // Pieces were sorted beforehand, so the page hash is independent of drawing order + } + + QByteArray hash = hasher.result(); + Q_ASSERT(QCryptographicHash::hashLength(QCryptographicHash::Sha512) == 64); + + size_t size = qMin(hash.length(), context.pageHash.size()); + std::copy(hash.data(), hash.data() + size, context.pageHash.data()); +} + void PDFDiff::onComparationPerformed() { m_cancelled = false; diff --git a/Pdf4QtLib/sources/pdfdiff.h b/Pdf4QtLib/sources/pdfdiff.h index ce3a046..9256c4d 100644 --- a/Pdf4QtLib/sources/pdfdiff.h +++ b/Pdf4QtLib/sources/pdfdiff.h @@ -31,6 +31,8 @@ namespace pdf { +struct PDFDiffPageContext; + class PDFDiffResult { public: @@ -57,8 +59,12 @@ public: enum Option { - None = 0x0000, - Asynchronous = 0x0001, ///< Compare document asynchronously + None = 0x0000, + Asynchronous = 0x0001, ///< Compare document asynchronously + PC_Text = 0x0002, ///< Use text to compare pages (to determine which pages correspond to each other) + PC_VectorGraphics = 0x0004, ///< Use vector graphics to compare pages (to determine which pages correspond to each other) + PC_Images = 0x0008, ///< Use images to compare pages (to determine which pages correspond to each other) + PC_Mesh = 0x0010, ///< Use shadings (meshes) to compare pages (to determine which pages correspond to each other) }; Q_DECLARE_FLAGS(Options, Option) @@ -109,6 +115,7 @@ private: { StepExtractContentLeftDocument, StepExtractContentRightDocument, + StepMatchPages, StepExtractTextLeftDocument, 
StepExtractTextRightDocument, StepCompare, @@ -119,6 +126,7 @@ private: void stepProgress(); void performSteps(const std::vector& leftPages, const std::vector& rightPages); + void finalizeGraphicsPieces(PDFDiffPageContext& context); void onComparationPerformed(); diff --git a/Pdf4QtLib/sources/pdfpainter.cpp b/Pdf4QtLib/sources/pdfpainter.cpp index d1d8d7c..e7fea70 100644 --- a/Pdf4QtLib/sources/pdfpainter.cpp +++ b/Pdf4QtLib/sources/pdfpainter.cpp @@ -832,7 +832,8 @@ void PDFPrecompiledPage::finalize(qint64 compilingTimeNS, QList } } -PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceInfos(PDFReal epsilon) const +PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceInfos(QRectF mediaBox, + PDFReal epsilon) const { GraphicPieceInfos infos; @@ -850,6 +851,8 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI } PDFReal factor = 1.0 / epsilon; + QImage shadingTestImage; // Scratch image for shadings, created on first use and cleared before each shading + // Process all instructions for (const Instruction& instruction : m_instructions) { @@ -903,33 +906,98 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI case InstructionType::DrawImage: { - /*const ImageData& data = m_images[instruction.dataIndex]; + const ImageData& data = m_images[instruction.dataIndex]; const QImage& image = data.image; - painter->save(); + GraphicPieceInfo info; + QByteArray serializedPath; - QMatrix imageTransform(1.0 / image.width(), 0, 0, 1.0 / image.height(), 0, 0); - QMatrix worldMatrix = imageTransform * painter->worldMatrix(); + // Serialize data + if (true) + { + QDataStream stream(&serializedPath, QIODevice::WriteOnly); - // Jakub Melka: Because Qt uses opposite axis direction than PDF, then we must transform the y-axis - // to the opposite (so the image is then unchanged) - worldMatrix.translate(0, image.height()); - worldMatrix.scale(1, -1); + // Jakub Melka: serialize image position + QMatrix worldMatrix = stateStack.top().matrix; - 
painter->setWorldMatrix(worldMatrix); - painter->drawImage(0, 0, image); - painter->restore();*/ + QPainterPath pagePath; + pagePath.addRect(0, 0, 1, 1); + pagePath = worldMatrix.map(pagePath); + + info.type = GraphicPieceInfo::Type::Image; + info.boundingRect = pagePath.controlPointRect(); + + const int elementCount = pagePath.elementCount(); + for (int i = 0; i < elementCount; ++i) + { + QPainterPath::Element element = pagePath.elementAt(i); + + PDFReal roundedX = qRound(element.x * factor); + PDFReal roundedY = qRound(element.y * factor); + + stream << roundedX; + stream << roundedY; + stream << element.type; + } + + // Serialize raw image data (pixel bytes), after the rounded image outline + stream.writeBytes(reinterpret_cast(image.bits()), image.sizeInBytes()); + } + + QByteArray hash = QCryptographicHash::hash(serializedPath, QCryptographicHash::Sha512); + Q_ASSERT(QCryptographicHash::hashLength(QCryptographicHash::Sha512) == 64); + + size_t size = qMin(hash.length(), info.hash.size()); + std::copy(hash.data(), hash.data() + size, info.hash.data()); + + infos.emplace_back(std::move(info)); break; } case InstructionType::DrawMesh: { - /*const MeshPaintData& data = m_meshes[instruction.dataIndex]; + const MeshPaintData& data = m_meshes[instruction.dataIndex]; - painter->save(); - painter->setWorldMatrix(pagePointToDevicePointMatrix); - data.mesh.paint(painter, data.alpha); - painter->restore();*/ + if (shadingTestImage.isNull()) + { + QSizeF mediaBoxSize = mediaBox.size(); + mediaBoxSize = mediaBoxSize.scaled(256, 256, Qt::KeepAspectRatio); + QSize imageSize = mediaBoxSize.toSize(); + shadingTestImage = QImage(imageSize, QImage::Format_ARGB32); + } + + shadingTestImage.fill(Qt::transparent); + + QMatrix pagePointToDevicePointMatrix; + pagePointToDevicePointMatrix.scale(shadingTestImage.width() / mediaBox.width(), -shadingTestImage.height() / mediaBox.height()); // NOTE(review): y scale is negative and no translation is applied, so positive page y maps to negative device y; confirm the shading is painted inside the image + + { + QPainter painter(&shadingTestImage); + painter.setWorldMatrix(pagePointToDevicePointMatrix); + data.mesh.paint(&painter, data.alpha); + } + + 
GraphicPieceInfo info; + QByteArray serializedMesh; + + // Serialize data + if (true) + { + QDataStream stream(&serializedMesh, QIODevice::WriteOnly); + + // Serialize the pixels of the rendered shading test image + stream.writeBytes(reinterpret_cast(shadingTestImage.bits()), shadingTestImage.sizeInBytes()); + } + + QByteArray hash = QCryptographicHash::hash(serializedMesh, QCryptographicHash::Sha512); + Q_ASSERT(QCryptographicHash::hashLength(QCryptographicHash::Sha512) == 64); + + size_t size = qMin(hash.length(), info.hash.size()); + std::copy(hash.data(), hash.data() + size, info.hash.data()); + + info.boundingRect = QRectF(); + info.type = GraphicPieceInfo::Type::Shading; + infos.emplace_back(std::move(info)); break; } diff --git a/Pdf4QtLib/sources/pdfpainter.h b/Pdf4QtLib/sources/pdfpainter.h index ecf872a..26f85df 100644 --- a/Pdf4QtLib/sources/pdfpainter.h +++ b/Pdf4QtLib/sources/pdfpainter.h @@ -241,9 +241,20 @@ public: Unknown, Text, VectorGraphics, - Image + Image, + Shading }; + bool operator<(const GraphicPieceInfo& other) const + { + return std::tie(type, hash) < std::tie(other.type, other.hash); + } + + bool isText() const { return type == Type::Text; } + bool isVectorGraphics() const { return type == Type::VectorGraphics; } + bool isImage() const { return type == Type::Image; } + bool isShading() const { return type == Type::Shading; } + Type type = Type::Unknown; QRectF boundingRect; std::array hash = { }; @@ -255,8 +266,10 @@ public: /// for example, for comparation reasons. Parameter \p epsilon /// is for numerical precision - values under epsilon are considered /// as equal. + /// \param mediaBox Page's media box (defines size and scale of the image used for hashing shadings) /// \param epsilon Epsilon - GraphicPieceInfos calculateGraphicPieceInfos(PDFReal epsilon) const; + GraphicPieceInfos calculateGraphicPieceInfos(QRectF mediaBox, + PDFReal epsilon) const; private: struct PathPaintData