突出显示现有 PDF 中的单词

Question

我需要在给定我已经提取的特定坐标的情况下突出显示现有 PDF 中的一组单词。我正在使用 Apache 的 pdfbox（最新版本 2.0.8）。我可以使用一个示例文件来实现此目的（pdfbox 网站内的 AddAnnotations.java），但我认为该示例是使用较旧的 Java 版本编译的，因为以下导入不起作用：

import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationHighlight;

有人可以帮我吗？使用此库突出显示单词的最简单方法是什么？

Answer 1

下面是突出显示 PDF 文档中所有单词的代码。稍加修改即可只突出显示一组特定的单词。请注意，这只是一个测试版本：对于位于行尾（后接换行）的单词，以及位于旋转了 -90°/-180° 的横向/纵向页面中的单词，还需要进一步处理。此脚本也还有优化的空间。

此脚本是使用 Apache PDFBox 2.0.8 构建的。

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;

/**
 * Highlights every token of a PDF document.
 *
 * <p>The class extends {@link PDFTextStripper} and overrides
 * {@link #writeString(String, List)} to record, for each text chunk handed over
 * by the stripper, its text and bounding information. {@code main} then walks
 * the pages and adds one highlight annotation per recorded chunk.
 *
 * <p>Only page rotations of 0 and 90 degrees are handled explicitly; every other
 * rotation falls through to the non-rotated branch.
 */
public class TestAnnotatePDF extends PDFTextStripper
{
    /**
     * One entry per recorded token, laid out as
     * {@code {minx, miny, maxx, maxy, pageNo, height, width, rotation}}.
     * {@code pageNo} is compared against {@code page_index + 1}, i.e. it is 1-based.
     */
    static List<double[]> coordinates;

    /** Text of each recorded token; index-aligned with {@link #coordinates}. */
    static ArrayList<String> tokenStream;

    /**
     * Creates the stripper and resets the shared token/coordinate lists.
     *
     * @throws IOException if the underlying {@link PDFTextStripper} cannot be created
     */
    public TestAnnotatePDF() throws IOException
    {
        // data structure containing coordinate information for each token
        coordinates = new ArrayList<>();

        // list of words extracted from the text (whitespace-based tokenization)
        tokenStream = new ArrayList<>();
    }

    /**
     * Loads {@code MyDocument}, highlights all extracted tokens and saves the
     * result as {@code MyDocument_final.pdf}.
     *
     * @param args unused
     * @throws IOException declared for compatibility; I/O failures are caught and printed
     */
    public static void main(String [] args) throws IOException
    {
        // Loading an existing document
        File file = new File("MyDocument");

        // try-with-resources guarantees the document is closed even when
        // extraction, annotation or saving fails half-way.
        try (PDDocument document = PDDocument.load(file))
        {
            // extended PDFTextStripper class
            PDFTextStripper stripper = new TestAnnotatePDF();

            // Get number of pages
            int number_of_pages = document.getDocumentCatalog().getPages().getCount();

            // writeText invokes the overridden writeString below; the extracted
            // text itself is not needed, hence the throw-away writer.
            Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
            stripper.writeText(document, dummy);

            // Print collected information
            System.out.println(tokenStream);
            System.out.println(tokenStream.size());
            System.out.println(coordinates.size());

            // Color used for highlighting text
            PDColor red = new PDColor(new float[] { 1, 0, 0 }, PDDeviceRGB.INSTANCE);

            // scan each page and highlight all the words inside it
            for (int page_index = 0; page_index < number_of_pages; page_index++)
            {
                PDPage page = document.getPage(page_index);

                // Annotations of the selected page
                List<PDAnnotation> annotations = page.getAnnotations();

                // Page height and width
                double page_height = page.getMediaBox().getHeight();
                double page_width  = page.getMediaBox().getWidth();

                // Scan collected coordinates
                for (int i = 0; i < coordinates.size(); i++)
                {
                    double[] word = coordinates.get(i);

                    // ignore coordinates that do not belong to the current page
                    if ((int) word[4] != (page_index + 1))
                    {
                        continue;
                    }

                    // rotation of the page: portrait, landscape, ...
                    int rotation = (int) word[7];
                    double height = word[5];
                    double minx, maxx, miny, maxy;

                    if (rotation == 90)
                    {
                        // page rotated by 90 degrees: x and y swap roles
                        double width = (page_height * word[6]) / page_width;

                        maxx = word[1];
                        minx = word[1] - height;
                        miny = word[0];
                        maxy = word[0] + width;
                    }
                    else // TODO: handle the -90/-180 degree cases
                    {
                        // stripper coordinates grow downwards, PDF user space
                        // grows upwards, hence the subtraction from page_height
                        minx = word[0];
                        maxx = word[2];
                        miny = page_height - word[1];
                        maxy = page_height - word[3] + height;
                    }

                    // Add an annotation for each scanned word
                    PDAnnotationTextMarkup txtMark =
                            new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
                    txtMark.setColor(red);
                    txtMark.setConstantOpacity((float) 0.3); // opacity 0.3

                    PDRectangle position = new PDRectangle();
                    position.setLowerLeftX((float) minx);
                    position.setLowerLeftY((float) miny);
                    position.setUpperRightX((float) maxx);
                    // NOTE(review): height is added once more on top of maxy;
                    // kept exactly as in the original - confirm this is intended.
                    position.setUpperRightY((float) ((float) maxy + height));
                    txtMark.setRectangle(position);

                    float[] quads = new float[8];
                    quads[0] = position.getLowerLeftX();      // x1
                    quads[1] = position.getUpperRightY() - 2; // y1
                    quads[2] = position.getUpperRightX();     // x2
                    quads[3] = quads[1];                      // y2
                    quads[4] = quads[0];                      // x3
                    quads[5] = position.getLowerLeftY() - 2;  // y3
                    quads[6] = quads[2];                      // x4
                    quads[7] = quads[5];                      // y4
                    txtMark.setQuadPoints(quads);
                    txtMark.setContents(tokenStream.get(i));
                    annotations.add(txtMark);
                }
            }

            // Saving the document in a new file
            File highlighted_doc = new File("MyDocument_final.pdf");
            document.save(highlighted_doc);
        }
        catch (IOException e)
        {
            System.out.println(e);
        }
    }

    /**
     * Records the text and bounding information of one text chunk instead of
     * writing it to the output.
     *
     * @param string        the chunk text as supplied by the stripper (unused; the
     *                      token is rebuilt from {@code textPositions})
     * @param textPositions the glyph positions that make up the chunk
     * @throws IOException never thrown here; declared by the overridden method
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException
    {
        StringBuilder token = new StringBuilder();
        int token_length = textPositions.size();
        int counter = 1;
        double minx = 0, maxx = 0, miny = 0, maxy = 0;
        double height = 0;
        double width = 0;
        int rotation = 0;

        for (TextPosition text : textPositions)
        {
            rotation = text.getRotation();

            // keep the largest glyph height / width seen in this chunk
            if (text.getHeight() > height)
            {
                height = text.getHeight();
            }

            if (text.getWidth() > width)
            {
                width = text.getWidth();
            }

            // first char of the current word
            if (counter == 1)
            {
                minx = text.getX();
                miny = text.getY();
            }

            // last char of the current word
            if (counter == token_length)
            {
                maxx = text.getEndX();
                maxy = text.getY();
            }

            token.append(text);
            counter += 1;
        }

        tokenStream.add(token.toString());
        double word_coordinates [] = {minx, miny, maxx, maxy, this.getCurrentPageNo(), height, width, rotation};
        coordinates.add(word_coordinates);
    }
}

Answer 2

下面是在 PDF 文档中突出显示特定单词的代码。请注意，目前它突出显示的是包含搜索文本的整行，而不是单词本身；只突出显示特定单词的功能仍在开发中……如果有人能在此代码的基础上给出只突出显示特定单词的建议，我将不胜感激。

此脚本是使用 Apache PDFBox 2.0.8 构建的

    import java.io.ByteArrayOutputStream;
    import java.io.File;
    import java.io.IOException;
    import java.io.OutputStreamWriter;
    import java.io.Writer;
    import java.util.List;

    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.pdmodel.common.PDRectangle;
    import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
    import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
    import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
    import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
    import org.apache.pdfbox.text.PDFTextStripper;
    import org.apache.pdfbox.text.TextPosition;

    /**
     * Highlights every text chunk of a PDF that contains one of the search terms.
     *
     * <p>The class extends {@link PDFTextStripper}; while the stripper walks the
     * document, {@link #writeString(String, List)} adds a highlight annotation
     * spanning the whole chunk (from its first to its last glyph) whenever the
     * chunk text contains a term from {@link #CRITERIA}.
     */
    public class PDFhighlightDemo extends PDFTextStripper {

        /** Search terms; a chunk is highlighted when it contains any of them. */
        private static final String[] CRITERIA = {"angular", "prepared"};

        /**
         * @throws IOException if the underlying {@link PDFTextStripper} cannot be created
         */
        public PDFhighlightDemo()  throws IOException {
            super();
        }

        /**
         * Loads {@code Demo1.pdf}, highlights matching chunks and saves the result
         * as {@code FinalPDF.pdf}.
         *
         * @param args unused
         * @throws IOException if loading, processing or saving the document fails
         */
        public static void main(String[] args)  throws IOException {
            String fileName = "Demo1.pdf";
            // try-with-resources closes the document on every exit path.
            try (PDDocument document = PDDocument.load(new File(fileName))) {
                PDFTextStripper stripper = new PDFhighlightDemo();
                stripper.setSortByPosition(true);

                // Page numbers of the stripper are 1-based.
                stripper.setStartPage(1);
                stripper.setEndPage(document.getNumberOfPages());

                // The extracted text is not needed; writeText is only called so
                // that the overridden writeString runs for every chunk.
                Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
                stripper.writeText(document, dummy);

                File file1 = new File("FinalPDF.pdf");
                document.save(file1);
            }
        }

        /**
         * Adds a highlight annotation for the chunk when its text contains one of
         * the search terms; nothing is written to the output.
         *
         * @param string        text of the current chunk
         * @param textPositions glyph positions that make up the chunk
         * @throws IOException if the page annotations cannot be read
         */
        @Override
        protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
            // Without glyphs there is no geometry to highlight.
            if (textPositions.isEmpty() || !containsAnyCriterion(string)) {
                return;
            }

            // The box depends only on the first and last glyph, so it is computed
            // once instead of once per glyph.
            TextPosition first = textPositions.get(0);
            TextPosition last = textPositions.get(textPositions.size() - 1);

            float posXInit1 = first.getXDirAdj();
            float posXEnd1  = last.getXDirAdj() + last.getWidth();
            // Y values are flipped against the page height: the stripper measures
            // downwards, the annotation rectangle upwards.
            float posYInit1 = first.getPageHeight() - first.getYDirAdj();
            float posYEnd1  = first.getPageHeight() - last.getYDirAdj();
            float height1   = first.getHeightDir();

            // quadPoints is an array of x,y coordinates in Z-like order
            // (top-left, top-right, bottom-left, bottom-right) of the area to be highlighted
            float quadPoints[] = {posXInit1, posYEnd1 + height1 + 2, posXEnd1, posYEnd1 + height1 + 2, posXInit1, posYInit1 - 2, posXEnd1, posYEnd1 - 2};

            // "document" is the field inherited from PDFTextStripper; getCurrentPageNo() is 1-based.
            List<PDAnnotation> annotations = document.getPage(this.getCurrentPageNo() - 1).getAnnotations();
            PDAnnotationTextMarkup highlight = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);

            PDRectangle position = new PDRectangle();
            position.setLowerLeftX(posXInit1);
            position.setLowerLeftY(posYEnd1);
            position.setUpperRightX(posXEnd1);
            position.setUpperRightY(posYEnd1 + height1);

            highlight.setRectangle(position);
            highlight.setQuadPoints(quadPoints);

            PDColor yellow = new PDColor(new float[]{1, 1, 1 / 255F}, PDDeviceRGB.INSTANCE);
            highlight.setColor(yellow);
            annotations.add(highlight);
        }

        /** Returns whether {@code text} contains at least one of the search terms. */
        private static boolean containsAnyCriterion(String text) {
            for (String criterion : CRITERIA) {
                if (text.contains(criterion)) {
                    return true;
                }
            }
            return false;
        }

    }

Answer 3

非常感谢上面的答案！我的解决方案基于 PDFBox 的答案。

我做了一点修改，这样它只突出显示特定的单词，而不是整行。您必须调整开始和结束位置。我摆脱了 isFound boolean 并将其余代码移至 if 块内。然后添加两个变量，startPosition和endPosition。这是修改后的代码片段：

// Character range [startPosition, endPosition) of the search term inside the chunk text.
int startPosition = string.indexOf(criteria[i])
int endPosition = startPosition + criteria[i].length()

// First and last glyph of the matched word.
// NOTE(review): assumes textPositions holds one entry per character of string - confirm.
TextPosition firstGlyph = textPositions.get(startPosition)
TextPosition lastGlyph = textPositions.get(endPosition - 1)

posXInit = firstGlyph.getXDirAdj();
posXEnd = lastGlyph.getXDirAdj() + lastGlyph.getWidth();
posYInit = firstGlyph.getPageHeight() - firstGlyph.getYDirAdj();
posYEnd = firstGlyph.getPageHeight() - lastGlyph.getYDirAdj();
width = firstGlyph.getWidthDirAdj();
height = firstGlyph.getHeightDir();

我使用groovy，所以有一点不同，但这是整个功能：

/**
 * Adds a highlight annotation for every occurrence of a search term inside the
 * current text chunk; nothing is written to the output.
 *
 * Unlike a plain contains()/indexOf() check, all occurrences of a term in the
 * chunk are visited, not just the first one.
 *
 * @param string        text of the current chunk
 * @param textPositions glyph positions that make up the chunk
 * @throws IOException if the page annotations cannot be read
 */
@Override
public void writeString(String string, List<TextPosition> textPositions) throws IOException {
    float posXInit = 0
    float posXEnd = 0
    float posYInit = 0
    float posYEnd = 0
    float height = 0 
    // NOTE(review): fontHeight is never assigned, so the debug line below always prints 0.0.
    float fontHeight = 0
    String[] criteria = ["Word2", "Word5"];

    for (int i = 0; i < criteria.length; i++) {
        // An empty term would match everywhere and never advance the search.
        if (criteria[i].isEmpty()) {
            continue
        }

        int startPosition = string.indexOf(criteria[i])
        while (startPosition >= 0) {
            int endPosition = startPosition + criteria[i].length()

            // The character indexes are used as glyph indexes; stop when the
            // glyph list is shorter than the text so get() cannot go out of bounds.
            if (endPosition > textPositions.size()) {
                break
            }

            TextPosition firstGlyph = textPositions.get(startPosition)
            TextPosition lastGlyph = textPositions.get(endPosition - 1)

            posXInit = firstGlyph.getXDirAdj();
            posXEnd = lastGlyph.getXDirAdj() + lastGlyph.getWidth();
            // Y values are flipped against the page height.
            posYInit = firstGlyph.getPageHeight() - firstGlyph.getYDirAdj();
            posYEnd = firstGlyph.getPageHeight() - lastGlyph.getYDirAdj();
            height = firstGlyph.getHeightDir();

            println(string + "X-Init = " + posXInit + "; Y-Init = " + posYInit + "; X-End = " + posXEnd + "; Y-End = " + posYEnd + "; Font-Height = " + fontHeight);

            // quadPoints is array of x,y coordinates in Z-like order (top-left, top-right, bottom-left,bottom-right)
            // of the area to be highlighted
            float[] quadPoints = [posXInit, posYEnd + height + 2, posXEnd, posYEnd + height + 2, posXInit, posYInit - 2, posXEnd, posYEnd - 2];

            /* page numeration is index-based. Starts from 0 */
            List<PDAnnotation> annotations = document.getPage(this.getCurrentPageNo() - 1).getAnnotations();
            PDAnnotationTextMarkup highlight = new PDAnnotationTextMarkup("Highlight");

            PDRectangle position = new PDRectangle();
            position.setLowerLeftX((float) posXInit);
            position.setLowerLeftY((float) posYEnd);
            position.setUpperRightX((float) posXEnd);
            position.setUpperRightY((float) (posYEnd +  height));

            highlight.setRectangle(position);
            highlight.setQuadPoints(quadPoints);

            float[] components = [ (float) 1, (float) 1, (float) (100 / 255)]
            PDColor yellow = new PDColor(components, PDDeviceRGB.INSTANCE);
            highlight.setColor(yellow);
            annotations.add(highlight);

            // Continue searching after the occurrence just highlighted.
            startPosition = string.indexOf(criteria[i], endPosition)
        }
    }
}

最终文档看起来像这样（原文此处为指向示例结果的链接）。

Answer 4

使用 PDFclown 突出显示文档中的特定单词。

package com.NLP.demo;

import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.pdfclown.documents.Page;
import org.pdfclown.documents.contents.ITextString;
import org.pdfclown.documents.contents.TextChar;
import org.pdfclown.documents.interaction.annotations.TextMarkup;
import org.pdfclown.documents.interaction.annotations.TextMarkup.MarkupTypeEnum;
import org.pdfclown.files.SerializationModeEnum;
import org.pdfclown.tools.TextExtractor;
import org.pdfclown.util.math.Interval;
import org.pdfclown.util.math.geom.Quad;

/**
 * Highlights every match of a text pattern in a PDF using PDF Clown.
 *
 * <p>For each page the text is extracted, matched against a case-insensitive
 * regular expression, and each match is covered by a highlight
 * {@link TextMarkup} annotation. The document is then saved in standard mode.
 */
public class PDFCrownDemo  {

    /**
     * Standard JVM entry point.
     *
     * @param args unused
     * @throws IOException if the PDF cannot be read or written
     */
    public static void main(String[] args) throws IOException {
        new PDFCrownDemo().highlighttext();
    }

    /**
     * Argument-less variant kept for existing callers; the JVM itself can only
     * launch {@link #main(String[])}.
     *
     * @throws IOException if the PDF cannot be read or written
     */
    public static void main() throws IOException {
        main(new String[0]);
    }

    /**
     * Opens {@code src/main/resources/XXX.pdf}, highlights all matches of
     * {@code "Contract"} (case-insensitive) and saves the file.
     *
     * @throws IOException if the PDF cannot be read or written
     */
    public void highlighttext() throws IOException{

        String textRegEx = "Contract";
        Pattern pattern = Pattern.compile(textRegEx, Pattern.CASE_INSENSITIVE);

        TextExtractor textExtractor = new TextExtractor(true, true);

        // try-with-resources releases the underlying file handle on every exit path.
        try (org.pdfclown.files.File file = new org.pdfclown.files.File("src/main/resources/XXX.pdf"))
        {
            for(final Page page : file.getDocument().getPages())
            {
                Map<Rectangle2D,List<ITextString>> textStrings = textExtractor.extract(page);
                final Matcher matcher = pattern.matcher(TextExtractor.toString(textStrings));

                // The filter pulls match intervals from the matcher and calls
                // process() with the text covering each interval.
                textExtractor.filter(textStrings, new TextExtractor.IIntervalFilter()
                {
                    @Override
                    public boolean hasNext()
                    {return matcher.find();}

                    @Override
                    public Interval next()
                    {return new Interval(matcher.start(), matcher.end());}

                    @Override
                    public void process(Interval interval, ITextString match)
                    {
                        // Defining the highlight box of the text pattern match...
                        List<Quad> highlightQuads = new ArrayList<>();
                        {
                            /*
                              NOTE: A text pattern match may be split across multiple contiguous lines,
                              so we have to define a distinct highlight box for each text chunk.
                            */
                            Rectangle2D textBox = null;
                            for(TextChar textChar : match.getTextChars())
                            {
                                Rectangle2D textCharBox = textChar.getBox();
                                if(textBox == null)
                                {textBox = (Rectangle2D)textCharBox.clone();}
                                else
                                {
                                    // A char starting below the current box begins a new line chunk.
                                    if(textCharBox.getY() > textBox.getMaxY())
                                    {
                                        highlightQuads.add(Quad.get(textBox));
                                        textBox = (Rectangle2D)textCharBox.clone();
                                    }
                                    else
                                    {textBox.add(textCharBox);}
                                }
                            }
                            highlightQuads.add(Quad.get(textBox));
                        }
                        // Highlight the text pattern match!
                        new TextMarkup(page, MarkupTypeEnum.Highlight, highlightQuads);
                    }

                    @Override
                    public void remove(
                        )
                    {throw new UnsupportedOperationException();}
                }
                );
            }

            //file.save(SerializationModeEnum.Incremental);
            file.save(new java.io.File("src/main/resources/XXX.pdf"), SerializationModeEnum.Standard);
        }
    }

}

突出显示现有 PDF 中的单词

问题描述投票：0回答：4

4个回答

最新问题

突出显示现有 PDF 中的单词

问题描述 投票：0回答：4

4个回答

最新问题

问题描述投票：0回答：4