我需要在给定我已经提取的特定坐标的情况下突出显示现有 PDF 中的一组单词。 我正在使用 Apache 的 pdfbox(最新版本 2.0.8)。 我可以使用一个示例文件来实现此目的(pdfbox 网站内的 AddAnnotations.java),但我认为该示例是使用较旧的 Java 版本编译的,因为以下导入不起作用:
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationHighlight;
有人可以帮我吗?使用此库突出显示单词的最简单方法是什么?
下面是突出显示 PDF 文档中所有单词的代码。稍加修改此脚本,即可只突出显示一组特定的单词。请注意,这只是一个测试版本:对于位于行尾(后接换行)的单词,以及位于以负角度旋转(-90°/-180°)的横向/纵向 PDF 页面中的单词,还需要进一步检查。此脚本也还有优化的空间。
此脚本是使用 Apache PDFBox 2.0.8 构建的。
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
/**
 * Demo that highlights every token of a PDF document.
 *
 * <p>Extraction runs first: {@code writeText} calls the overridden
 * {@link #writeString(String, List)}, which records each token and its
 * coordinates in the static lists below. {@link #main(String[])} then walks the
 * pages and adds one highlight annotation per recorded token.
 *
 * <p>Only page rotations of 90 degrees get dedicated handling; every other
 * rotation value falls through to the unrotated formula.
 */
public class TestAnnotatePDF extends PDFTextStripper
{
    // Layout of each double[] stored in {@link #coordinates}.
    private static final int IDX_MIN_X = 0;
    private static final int IDX_MIN_Y = 1;
    private static final int IDX_MAX_X = 2;
    private static final int IDX_MAX_Y = 3;
    private static final int IDX_PAGE = 4;
    private static final int IDX_HEIGHT = 5;
    private static final int IDX_WIDTH = 6;
    private static final int IDX_ROTATION = 7;

    /** One entry per token: {minx, miny, maxx, maxy, pageNo, height, width, rotation}. */
    static List<double[]> coordinates;

    /** Extracted tokens; element i belongs to {@code coordinates.get(i)}. */
    static ArrayList<String> tokenStream;

    /**
     * Creates the stripper and resets the shared token/coordinate lists.
     *
     * @throws IOException if the underlying {@link PDFTextStripper} cannot be created
     */
    public TestAnnotatePDF() throws IOException
    {
        // data structure containing coordinates information for each token
        coordinates = new ArrayList<>();
        // list of words extracted from text (considering a whitespace-based tokenization)
        tokenStream = new ArrayList<>();
    }

    /**
     * Loads "MyDocument", highlights every extracted token and saves the result
     * as "MyDocument_final.pdf".
     *
     * @param args unused
     * @throws IOException declared for compatibility; I/O failures are caught and printed
     */
    public static void main(String[] args) throws IOException
    {
        File file = new File("MyDocument");
        // try-with-resources: the document is released even when extraction,
        // annotation or saving fails part-way through.
        try (PDDocument document = PDDocument.load(file);
             Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream()))
        {
            // extended PDFTextStripper class
            PDFTextStripper stripper = new TestAnnotatePDF();
            int numberOfPages = document.getDocumentCatalog().getPages().getCount();

            // writeText invokes the overridden writeString, which fills the static lists
            stripper.writeText(document, dummy);

            // print collected information
            System.out.println(tokenStream);
            System.out.println(tokenStream.size());
            System.out.println(coordinates.size());

            // colour used for every highlight
            PDColor red = new PDColor(new float[] { 1, 0, 0 }, PDDeviceRGB.INSTANCE);

            // scan each page and highlight all the words inside it
            for (int pageIndex = 0; pageIndex < numberOfPages; pageIndex++)
            {
                PDPage page = document.getPage(pageIndex);
                List<PDAnnotation> annotations = page.getAnnotations();
                double pageHeight = page.getMediaBox().getHeight();
                double pageWidth = page.getMediaBox().getWidth();

                for (int i = 0; i < coordinates.size(); i++)
                {
                    double[] entry = coordinates.get(i);
                    // recorded page numbers are 1-based, pageIndex is 0-based
                    if ((int) entry[IDX_PAGE] != pageIndex + 1)
                    {
                        continue;
                    }
                    annotations.add(buildHighlight(entry, pageHeight, pageWidth, red, tokenStream.get(i)));
                }
            }

            // saving the document in a new file
            document.save(new File("MyDocument_final.pdf"));
        }
        catch (IOException e)
        {
            System.out.println(e);
        }
    }

    /**
     * Builds the highlight annotation for one recorded token.
     *
     * @param entry      coordinate record as produced by {@link #writeString(String, List)}
     * @param pageHeight media box height of the page holding the token
     * @param pageWidth  media box width of the page holding the token
     * @param color      highlight colour
     * @param contents   text stored as the annotation contents
     * @return the configured highlight annotation (not yet attached to a page)
     */
    private static PDAnnotationTextMarkup buildHighlight(double[] entry, double pageHeight,
            double pageWidth, PDColor color, String contents)
    {
        double height = entry[IDX_HEIGHT];
        double minx;
        double maxx;
        double miny;
        double maxy;

        if ((int) entry[IDX_ROTATION] == 90)
        {
            // page rotated by 90 degrees: x and y swap roles, width is rescaled
            double width = (pageHeight * entry[IDX_WIDTH]) / pageWidth;
            maxx = entry[IDX_MIN_Y];
            minx = entry[IDX_MIN_Y] - height;
            miny = entry[IDX_MIN_X];
            maxy = entry[IDX_MIN_X] + width;
        }
        else // the cases -90/-180 degrees are not handled separately yet
        {
            minx = entry[IDX_MIN_X];
            maxx = entry[IDX_MAX_X];
            // recorded y values are subtracted from the page height to get the PDF y coordinate
            miny = pageHeight - entry[IDX_MIN_Y];
            maxy = pageHeight - entry[IDX_MAX_Y] + height;
        }

        PDAnnotationTextMarkup txtMark =
                new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
        txtMark.setColor(color);
        txtMark.setConstantOpacity(0.3f); // constant opacity of 0.3

        PDRectangle position = new PDRectangle();
        position.setLowerLeftX((float) minx);
        position.setLowerLeftY((float) miny);
        position.setUpperRightX((float) maxx);
        position.setUpperRightY((float) ((float) maxy + height));
        txtMark.setRectangle(position);

        // quad points: two points on the top edge, then two on the bottom edge, each shifted down by 2
        float top = position.getUpperRightY() - 2;
        float bottom = position.getLowerLeftY() - 2;
        float left = position.getLowerLeftX();
        float right = position.getUpperRightX();
        txtMark.setQuadPoints(new float[] { left, top, right, top, left, bottom, right, bottom });

        txtMark.setContents(contents);
        return txtMark;
    }

    /**
     * Records one token and its coordinates instead of writing it to the output.
     *
     * <p>The start position comes from the first character, the end position from
     * the last one; height and width are the maxima over all characters, and the
     * rotation is the one reported by the last character.
     *
     * @param string        the text chunk (unused; the token is rebuilt from the positions)
     * @param textPositions per-character positions of the chunk
     * @throws IOException never thrown here; kept to match the overridden signature
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException
    {
        StringBuilder token = new StringBuilder();
        int lastIndex = textPositions.size() - 1;
        int index = 0;
        double minx = 0, maxx = 0, miny = 0, maxy = 0;
        double height = 0;
        double width = 0;
        int rotation = 0;

        for (TextPosition text : textPositions)
        {
            rotation = text.getRotation();
            if (text.getHeight() > height)
            {
                height = text.getHeight();
            }
            if (text.getWidth() > width)
            {
                width = text.getWidth();
            }
            // first char of the current word
            if (index == 0)
            {
                minx = text.getX();
                miny = text.getY();
            }
            // last char of the current word
            if (index == lastIndex)
            {
                maxx = text.getEndX();
                maxy = text.getY();
            }
            token.append(text);
            index++;
        }

        tokenStream.add(token.toString());
        double[] wordCoordinates =
                { minx, miny, maxx, maxy, this.getCurrentPageNo(), height, width, rotation };
        coordinates.add(wordCoordinates);
    }
}
下面是突出显示 PDF 文档中特定单词的代码。请注意,目前它突出显示的是包含搜索文本的整行;仅突出显示单词本身的功能仍在开发中……欢迎在此代码的基础上提出只突出显示特定单词的建议。
此脚本是使用 Apache PDFBox 2.0.8 构建的
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
/**
 * Demo that highlights every text chunk containing one of a fixed set of
 * search terms.
 *
 * <p>The highlight covers the whole chunk passed to
 * {@link #writeString(String, List)}, not just the matching term.
 */
public class PDFhighlightDemo extends PDFTextStripper {

    /** A chunk is highlighted when it contains at least one of these substrings. */
    private static final String[] CRITERIA = {"angular", "prepared"};

    /**
     * @throws IOException if the underlying {@link PDFTextStripper} cannot be created
     */
    public PDFhighlightDemo() throws IOException {
        super();
    }

    /**
     * Loads "Demo1.pdf", adds the highlight annotations while the text is being
     * stripped, and saves the result as "FinalPDF.pdf".
     *
     * @param args unused
     * @throws IOException if the document cannot be read, processed or written
     */
    public static void main(String[] args) throws IOException {
        PDDocument document = null;
        String fileName = "Demo1.pdf";
        try {
            document = PDDocument.load(new File(fileName));
            PDFTextStripper stripper = new PDFhighlightDemo();
            stripper.setSortByPosition(true);
            // page numbers are 1-based in PDFTextStripper
            stripper.setStartPage(1);
            stripper.setEndPage(document.getNumberOfPages());
            // the extracted text itself is not needed; annotations are added as a side effect
            Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
            stripper.writeText(document, dummy);
            File file1 = new File("FinalPDF.pdf");
            document.save(file1);
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }

    /**
     * Adds a highlight annotation over the chunk when it contains a search term.
     * Nothing is written to the output.
     *
     * @param string        text of the chunk
     * @param textPositions per-character positions of the chunk
     * @throws IOException if the page annotations cannot be read
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        // without positions there is nothing to anchor a highlight to
        if (textPositions.isEmpty() || !containsAnyCriterion(string)) {
            return;
        }

        // only the first and last character decide the box; no need to visit the others
        TextPosition first = textPositions.get(0);
        TextPosition last = textPositions.get(textPositions.size() - 1);

        float posXInit = first.getXDirAdj();
        float posXEnd = last.getXDirAdj() + last.getWidth();
        // page height minus the direction-adjusted y value gives the y used for the annotation
        float posYInit = first.getPageHeight() - first.getYDirAdj();
        float posYEnd = first.getPageHeight() - last.getYDirAdj();
        float height = first.getHeightDir();

        // quadPoints is array of x,y coordinates in Z-like order
        // (top-left, top-right, bottom-left, bottom-right) of the area to be highlighted
        float[] quadPoints = {
            posXInit, posYEnd + height + 2,
            posXEnd, posYEnd + height + 2,
            posXInit, posYInit - 2,
            posXEnd, posYEnd - 2
        };

        PDRectangle position = new PDRectangle();
        position.setLowerLeftX(posXInit);
        position.setLowerLeftY(posYEnd);
        position.setUpperRightX(posXEnd);
        position.setUpperRightY(posYEnd + height);

        PDAnnotationTextMarkup highlight =
                new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
        highlight.setRectangle(position);
        highlight.setQuadPoints(quadPoints);
        PDColor yellow = new PDColor(new float[]{1, 1, 1 / 255F}, PDDeviceRGB.INSTANCE);
        highlight.setColor(yellow);

        // getCurrentPageNo() is 1-based, getPage() is 0-based
        List<PDAnnotation> annotations = document.getPage(this.getCurrentPageNo() - 1).getAnnotations();
        annotations.add(highlight);
    }

    /** Returns true when {@code text} contains at least one entry of {@link #CRITERIA}. */
    private static boolean containsAnyCriterion(String text) {
        for (String criterion : CRITERIA) {
            if (text.contains(criterion)) {
                return true;
            }
        }
        return false;
    }
}
非常感谢上面的答案! 我的解决方案基于 PDFBox 的答案。
我做了一点修改,这样它只突出显示特定的单词,而不是整行。 您必须调整开始和结束位置。我摆脱了 isFound boolean 并将其余代码移至 if 块内。 然后添加两个变量,startPosition和endPosition。这是修改后的代码片段:
// Character offsets of the first occurrence of criteria[i] inside the chunk text
// (endPosition is exclusive).
// NOTE(review): the offsets are used below as indices into textPositions, which assumes
// one TextPosition per character of string — confirm.
int startPosition = string.indexOf(criteria[i])
int endPosition = startPosition + criteria[i].length()
// Horizontal extent: x of the first matched character up to x + width of the last one.
posXInit = textPositions.get(startPosition).getXDirAdj();
posXEnd = textPositions.get(endPosition - 1).getXDirAdj() + textPositions.get(endPosition - 1).getWidth();
// Vertical positions: page height minus the direction-adjusted y of the first / last matched character.
posYInit = textPositions.get(startPosition).getPageHeight() - textPositions.get(startPosition).getYDirAdj();
posYEnd = textPositions.get(startPosition).getPageHeight() - textPositions.get(endPosition - 1).getYDirAdj();
// Width and height are taken from the first matched character.
width = textPositions.get(startPosition).getWidthDirAdj();
height = textPositions.get(startPosition).getHeightDir();
我使用groovy,所以有一点不同,但这是整个功能:
/**
 * Adds a highlight annotation over every occurrence of each search word found
 * in the chunk. Nothing is written to the output.
 *
 * Character offsets inside {@code string} are used as indices into
 * {@code textPositions}; an occurrence whose offsets fall outside that list is
 * skipped instead of raising an index error.
 *
 * @param string        text of the chunk
 * @param textPositions per-character positions of the chunk
 * @throws IOException if the page annotations cannot be read
 */
@Override
public void writeString(String string, List<TextPosition> textPositions) throws IOException {
    String[] criteria = ["Word2", "Word5"];
    for (int i = 0; i < criteria.length; i++) {
        String word = criteria[i]
        if (word.isEmpty()) {
            // an empty search word matches everywhere and has no extent to highlight
            continue
        }
        // walk through all non-overlapping occurrences, not just the first one
        int startPosition = string.indexOf(word)
        while (startPosition >= 0) {
            int endPosition = startPosition + word.length()
            if (endPosition > textPositions.size()) {
                // string and textPositions are out of step; later matches would be too
                break
            }
            TextPosition first = textPositions.get(startPosition)
            TextPosition last = textPositions.get(endPosition - 1)

            float posXInit = first.getXDirAdj();
            float posXEnd = last.getXDirAdj() + last.getWidth();
            // page height minus the direction-adjusted y value gives the y used for the annotation
            float posYInit = first.getPageHeight() - first.getYDirAdj();
            float posYEnd = first.getPageHeight() - last.getYDirAdj();
            float height = first.getHeightDir();

            println(string + "X-Init = " + posXInit + "; Y-Init = " + posYInit + "; X-End = " + posXEnd + "; Y-End = " + posYEnd + "; Font-Height = " + height);

            // quadPoints is array of x,y coordinates in Z-like order
            // (top-left, top-right, bottom-left, bottom-right) of the area to be highlighted
            float[] quadPoints = [posXInit, posYEnd + height + 2, posXEnd, posYEnd + height + 2, posXInit, posYInit - 2, posXEnd, posYEnd - 2];

            PDRectangle position = new PDRectangle();
            position.setLowerLeftX((float) posXInit);
            position.setLowerLeftY((float) posYEnd);
            position.setUpperRightX((float) posXEnd);
            position.setUpperRightY((float) (posYEnd + height));

            PDAnnotationTextMarkup highlight = new PDAnnotationTextMarkup("Highlight");
            highlight.setRectangle(position);
            highlight.setQuadPoints(quadPoints);
            float[] components = [ (float) 1, (float) 1, (float) (100 / 255)]
            PDColor yellow = new PDColor(components, PDDeviceRGB.INSTANCE);
            highlight.setColor(yellow);

            /* getCurrentPageNo() is 1-based, getPage() is index-based and starts from 0 */
            List<PDAnnotation> annotations = document.getPage(this.getCurrentPageNo() - 1).getAnnotations();
            annotations.add(highlight);

            startPosition = string.indexOf(word, endPosition)
        }
    }
}
最终文档看起来像这样(原文此处的链接已丢失)。
使用 PDFclown 突出显示文档中的特定单词。
package com.NLP.demo;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.pdfclown.documents.Page;
import org.pdfclown.documents.contents.ITextString;
import org.pdfclown.documents.contents.TextChar;
import org.pdfclown.documents.interaction.annotations.TextMarkup;
import org.pdfclown.documents.interaction.annotations.TextMarkup.MarkupTypeEnum;
import org.pdfclown.files.SerializationModeEnum;
import org.pdfclown.tools.TextExtractor;
import org.pdfclown.util.math.Interval;
import org.pdfclown.util.math.geom.Quad;
/**
 * Demo that highlights every case-insensitive match of a regular expression in
 * a PDF using PDF Clown.
 */
public class PDFCrownDemo {

    /**
     * Standard JVM entry point.
     *
     * @param args unused
     * @throws IOException if the PDF cannot be read or written
     */
    public static void main(String[] args) throws IOException {
        new PDFCrownDemo().highlighttext();
    }

    /**
     * No-argument variant kept for existing callers; the JVM cannot launch a
     * class through it, so it delegates to {@link #main(String[])}.
     *
     * @throws IOException if the PDF cannot be read or written
     */
    public static void main() throws IOException {
        main(new String[0]);
    }

    /**
     * Opens "src/main/resources/XXX.pdf", adds a highlight markup over every
     * match of the pattern on every page and saves the file back to the same path.
     *
     * @throws IOException if the PDF cannot be read or written
     */
    public void highlighttext() throws IOException {
        org.pdfclown.files.File file = new org.pdfclown.files.File("src/main/resources/XXX.pdf");
        try {
            String textRegEx = "Contract";
            Pattern pattern = Pattern.compile(textRegEx, Pattern.CASE_INSENSITIVE);
            TextExtractor textExtractor = new TextExtractor(true, true);

            for (final Page page : file.getDocument().getPages()) {
                Map<Rectangle2D, List<ITextString>> textStrings = textExtractor.extract(page);
                final Matcher matcher = pattern.matcher(TextExtractor.toString(textStrings));

                // the filter feeds each regex match interval to process() together with its text
                textExtractor.filter(textStrings, new TextExtractor.IIntervalFilter() {
                    @Override
                    public boolean hasNext() {
                        return matcher.find();
                    }

                    @Override
                    public Interval next() {
                        return new Interval(matcher.start(), matcher.end());
                    }

                    @Override
                    public void process(Interval interval, ITextString match) {
                        // Defining the highlight box of the text pattern match...
                        List<Quad> highlightQuads = new ArrayList<Quad>();
                        /*
                          NOTE: A text pattern match may be split across multiple contiguous lines,
                          so we have to define a distinct highlight box for each text chunk.
                        */
                        Rectangle2D textBox = null;
                        for (TextChar textChar : match.getTextChars()) {
                            Rectangle2D textCharBox = textChar.getBox();
                            if (textBox == null) {
                                textBox = (Rectangle2D) textCharBox.clone();
                            } else if (textCharBox.getY() > textBox.getMaxY()) {
                                // character starts below the current box: close it and start a new one
                                highlightQuads.add(Quad.get(textBox));
                                textBox = (Rectangle2D) textCharBox.clone();
                            } else {
                                textBox.add(textCharBox);
                            }
                        }
                        // a match without characters leaves textBox null; nothing to highlight then
                        if (textBox != null) {
                            highlightQuads.add(Quad.get(textBox));
                        }
                        if (highlightQuads.isEmpty()) {
                            return;
                        }
                        // Highlight the text pattern match!
                        new TextMarkup(page, MarkupTypeEnum.Highlight, highlightQuads);
                    }

                    @Override
                    public void remove() {
                        throw new UnsupportedOperationException();
                    }
                });
            }

            file.save(new java.io.File("src/main/resources/XXX.pdf"), SerializationModeEnum.Standard);
        } finally {
            // release the underlying file handle even when extraction or saving fails
            file.close();
        }
    }
}