对于我的项目,我需要用JS替换PDF中的文本。我取得了进展,但专业程序(如 Word 或 LibreWriter)的 PDF 输出没有任何可理解和可替换的文本。我需要做什么?
基本原始 PDF 文件(仅一部分): 4 0 obj << /Length 132 >> stream BT /F1 12 Tf 50 150 TD (这是一个简单的PDF)Tj 0 -20 Td /F2 12 Tf (这是粗体文字)Tj 0 -20 Td /F3 12 Tf (这是斜体文本)Tj ET endstream endobj
在上面我们可以很容易地看到文本,但是Word的PDF等没有真正的文本。我需要知道如何解析文件。任何文章或帮助都会有帮助,非常感谢您的阅读。
我尝试用 fs 流进行解析和替换:使用常量文本时可以成功,但当输入来自其他文件或用户时就无法工作。这是基本代码:
const fs = require('fs');
const path = require('path');
// Contents of the most recently loaded PDF; assigned by readPDFFile() and
// rewritten by replace(). Stays null until a file has been read.
var pdfData = null;
// Array of { text, font } entries produced by parsePDF() (and updated by
// processParsedText()). Stays null until parsing has run.
let parsedText = null;
// Maps a captured base font name to the full "/BaseFont /Name" text that
// matched; rebuilt from scratch on every parsePDF() call.
let fontMap = {};
/**
 * Loads a PDF file from disk into the module-level `pdfData` variable.
 *
 * NOTE(review): the file is decoded as UTF-8 text. Byte sequences that are not
 * valid UTF-8 (typical for compressed streams) are not preserved by this
 * decoding; confirm whether a byte-preserving encoding is needed here and in
 * saveToFile() before using this on arbitrary PDFs.
 *
 * @param {string} filePath - Path of the PDF file to read.
 * @returns {string|null} The decoded file contents, or null if reading failed.
 */
function readPDFFile(filePath) {
  try {
    pdfData = fs.readFileSync(filePath, 'utf8');
    return pdfData;
  } catch (err) {
    // Forget any previously loaded document so that later stages cannot
    // silently keep working on stale data from an earlier file.
    pdfData = null;
    console.error(`Could not read the PDF file: ${err.message}`);
    return null;
  }
}
/**
 * Scans the loaded PDF text line by line and collects every literal string
 * that is shown with the `Tj` operator inside a `stream` ... `endstream`
 * section.
 *
 * Side effects: rebuilds the module-level `fontMap` from `/BaseFont` lines and
 * stores the collected entries in the module-level `parsedText`.
 *
 * Each entry's `font` is simply the base font name captured from the most
 * recent `/BaseFont` line seen before the text (null if none was seen yet).
 *
 * @returns {Array<{text: string, font: (string|null)}>|null} The collected
 *   entries, or null when no PDF data has been loaded.
 */
function parsePDF() {
  if (!pdfData) {
    console.error('No PDF data available.');
    return null;
  }

  // Matches one "(...) Tj" at a time instead of greedily spanning from the
  // first "(" to the last ") Tj" on the line. The string body may contain
  // backslash escapes (e.g. "\)") and one level of balanced parentheses.
  const showTextPattern = /\(((?:\\.|[^\\()]|\((?:\\.|[^\\()])*\))*)\) Tj/g;
  const baseFontPattern = /\/BaseFont\s\/(\w+)(?:-Bold|-Oblique)?/;

  const text = [];
  fontMap = {};
  let inStream = false;
  let currentFont = null;

  // PDF files may end lines with CR, LF or CRLF, so split on all three.
  for (const rawLine of pdfData.split(/\r\n|\r|\n/)) {
    const line = rawLine.trim();

    if (line === 'stream') {
      inStream = true;
    } else if (line === 'endstream') {
      inStream = false;
    } else if (line.startsWith('/BaseFont')) {
      const fontMatch = baseFontPattern.exec(line);
      if (fontMatch) {
        currentFont = fontMatch[1];
        fontMap[currentFont] = fontMatch[0];
      }
    } else if (inStream) {
      // exec() on a /g regex walks through all matches and resets lastIndex
      // to 0 once it returns null, so the pattern is safe to reuse per line.
      let shown = showTextPattern.exec(line);
      while (shown !== null) {
        text.push({ text: shown[1], font: currentFont });
        shown = showTextPattern.exec(line);
      }
    }
  }

  parsedText = text;
  return parsedText;
}
/**
 * Appends a style suffix to the font name of every parsed text entry that has
 * a font.
 *
 * Supported styles are 'Bold' (suffix "-Bold") and 'Italic' (suffix
 * "-Oblique"). The entries are updated in place and the module-level
 * `parsedText` is replaced by a new array holding the same entries.
 *
 * @param {string} style - Either 'Bold' or 'Italic'.
 * @returns {Array<{text: string, font: (string|null)}>|null} The updated
 *   entries, or null when nothing has been parsed or the style is unknown.
 */
function processParsedText(style) {
  if (!parsedText) {
    console.error('No parsed text available.');
    return null;
  }

  // A Map only knows its own keys, so names inherited from Object.prototype
  // (e.g. 'constructor', 'toString') are rejected instead of being treated
  // as valid styles.
  const styleSuffixes = new Map([
    ['Italic', '-Oblique'],
    ['Bold', '-Bold'],
  ]);
  const suffix = styleSuffixes.get(style);
  if (!suffix) {
    console.error('Invalid style provided.');
    return null;
  }

  for (const item of parsedText) {
    if (item.font) {
      item.font = `${item.font}${suffix}`;
    }
  }
  parsedText = [...parsedText];
  return parsedText;
}
/**
 * Replaces the text of "(old text) Tj" operations in the loaded PDF with new
 * text and stores the result back in the module-level `pdfData`.
 *
 * `newText[i]` is the replacement for `parsedText[i].text`. All replacements
 * are applied in a single pass over the document, so text inserted for one
 * entry is never re-replaced by a later entry. When several entries share the
 * same old text, the replacement of the first one is used for every
 * occurrence. The new text is inserted verbatim between the parentheses.
 *
 * @param {Array<{text: string, font: (string|null)}>} parsedText - Entries
 *   describing the text currently in the PDF.
 * @param {string[]} newText - Replacement strings, one per entry.
 * @returns {Array<{text: string, font: (string|null)}>|null} The entries with
 *   their text swapped for the new text, or null on invalid input or when no
 *   PDF data has been loaded.
 */
function replace(parsedText, newText) {
  if (!parsedText) {
    console.error('No parsed text provided.');
    return null;
  }
  if (!newText || newText.length !== parsedText.length) {
    console.error('Invalid new text provided.');
    return null;
  }
  if (!pdfData) {
    console.error('No PDF data available.');
    return null;
  }

  const newContent = parsedText.map((item, index) => ({
    text: newText[index],
    font: item.font
  }));

  // Old text -> new text; the first entry wins for duplicated old text.
  const replacements = new Map();
  parsedText.forEach((item, index) => {
    const oldText = String(item.text);
    if (!replacements.has(oldText)) {
      replacements.set(oldText, newContent[index].text);
    }
  });

  if (replacements.size > 0) {
    // The old text is data, not a pattern: escape every regex metacharacter
    // so characters such as "(", "+", "?" or "." are matched literally.
    const escapeForRegExp = (value) => value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const alternatives = Array.from(replacements.keys())
      .map(escapeForRegExp)
      .join('|');
    const pattern = new RegExp(`\\((${alternatives})\\) Tj`, 'g');

    // A replacer function keeps "$&", "$1", ... in the new text literal.
    pdfData = pdfData.replace(
      pattern,
      (whole, oldText) => `(${replacements.get(oldText)}) Tj`
    );
  }

  return newContent;
}
/**
 * Writes the current PDF contents to disk as UTF-8 text.
 *
 * Nothing is written unless both a document has been loaded and a parse
 * result exists; in that case an error is logged instead.
 *
 * @param {string} filename - Destination path for the PDF.
 * @returns {void}
 */
function saveToFile(filename) {
  const isReady = pdfData && parsedText;
  if (isReady) {
    fs.writeFileSync(filename, String(pdfData), 'utf8');
    return;
  }
  console.error('No data available.');
}
/**
 * Runs the whole pipeline: read the input PDF, parse its text, and write the
 * current PDF contents to the output path.
 *
 * @param {string} inputFilePath - Path of the PDF to read.
 * @param {string} outputFilePath - Path the PDF is written to.
 * @returns {void}
 */
function processPDF(inputFilePath, outputFilePath) {
  // readPDFFile() has already logged the failure. Stop here so a document
  // left over from an earlier call is never parsed and written to the output.
  if (readPDFFile(inputFilePath) === null) {
    return;
  }
  parsePDF();
  saveToFile(outputFilePath);
}
// Public API: the individual pipeline steps, the all-in-one processPDF(), and
// two accessors for the module-level state.
module.exports = {
readPDFFile,
parsePDF,
processParsedText,
replace,
saveToFile,
processPDF,
// Exposed as functions so callers read the current value at call time; the
// underlying variables are reassigned by the functions above.
getPdfData: () => pdfData,
getParsedText: () => parsedText
};
要使用 JavaScript 将 PDF 解压缩为纯文本(例如 ANSI 或 JSON),可以考虑使用 Coherent PDF(cpdf)的 JS API。
我使用命令行方法来完成此类任务,因为一次性编辑更容易。但对于可编程方法,您需要 API 库。
作为示例,编码格式的 text.pdf 文件(我很清楚)将如下所示。
2 0 obj
<</Filter/FlateDecode/Length 361>>
stream
xM‘Ñn‚0†ïû'Ù…xQjQ”-ñÂeš]˜ÌL^ Ð"Ý€’Ò§ bB_Ïéþóóž!Y, •°Â÷ž@ZàT ݤêê`¥síc…Q¶3u˜›š5ʱOÓ§†Iáû2'YŒ÷WQ·•
[YÌgÄËQî¦ð RΡN8ŸAäOIðê?7I¦:¡xêƒïˆÇŽ@ ðÍ3$O-' tÝ(ÃRœL@èò®—#=ô$B´ï5‹×èãYž/ÆhÅEn@4Æ•ÊÂéã0mµ«:ìð)‚›bì0GÌÆd?*wC’H¬B£Ë¶õÁ½xÝm4îœ}ˆÜÿŽÊ|589ë‹+á¬ò?«À•ºƒBcðG‰„7½v%ì¤É|+!•_€Æ@Gc!¯tþû0Ö÷}(|÷`LäÖdÂ1ÓTºQ¬Ã‘Ý…¥«+ÔÙ§$BCëMÅÀÇ*r&ÿ#ɤÅ
endstream
endobj
但作为可编辑文本(
cpdf -decompress -no-preserve-objstm out.pdf -o text.pdf
),它看起来如下所示。
10 0 obj
<</Length 588>>
stream
BT 15 800 Td 40 TL/1 18 Tf(PlainText = http://foersom.com/net/HowTo/data/OoPdfFormExample.pdf)'/1 16 Tf 40 TL(12 letters.)' 20 TL(9)' 27 20 Td(letters.)' -27 0 Td(A)' 25 20 Td(a)' 17 20 Td(a)' 18 20 Td(a)' 23 20 Td(a)' -72 20 Td(b)' 58 20 Td(b)' -35 20 Td(c)' 17 20 Td(d)' 26.5 20 Td(r)' -58 20 Td(r)' 40 TL(goto page in another PDF)' 20 TL(Also plain text = https://pdfobject.com/pdf/sample-3pp.pdf#page=2)' 25 TL/1 11 Tf 120 140 Td(Sign and Secure this file.txt as a PDF with Adobe Reader)' -5 -40 Td(or click https://www.adobe.com/acrobat/online/sign-pdf.html)' ET 220 680 150 50 re S
endstream
endobj
或 JSON (
cpdf in.pdf -output-json -utf8 -output-json-parse-content-streams -o out.json
)。
{
"S": [
{},
[
[ "BT" ],
[ { "F": 15.0 }, { "F": 800.0 }, "Td" ],
[ { "F": 40.0 }, "TL" ],
[ "/1", { "F": 18.0 }, "Tf" ],
[
"PlainText = http://foersom.com/net/HowTo/data/OoPdfFormExample.pdf",
"'"
],
[ "/1", { "F": 16.0 }, "Tf" ],
[ { "F": 40.0 }, "TL" ],
[ "12 letters.", "'" ],
[ { "F": 20.0 }, "TL" ],
[ "9", "'" ],
[ { "F": 27.0 }, { "F": 20.0 }, "Td" ],
[ "letters.", "'" ],
[ { "F": -27.0 }, { "F": 0.0 }, "Td" ],
[ "A", "'" ],
[ { "F": 25.0 }, { "F": 20.0 }, "Td" ],
[ "a", "'" ],
[ { "F": 17.0 }, { "F": 20.0 }, "Td" ],
[ "a", "'" ],
[ { "F": 18.0 }, { "F": 20.0 }, "Td" ],
[ "a", "'" ],
[ { "F": 23.0 }, { "F": 20.0 }, "Td" ],
[ "a", "'" ],
[ { "F": -72.0 }, { "F": 20.0 }, "Td" ],
[ "b", "'" ],
[ { "F": 58.0 }, { "F": 20.0 }, "Td" ],
[ "b", "'" ],
[ { "F": -35.0 }, { "F": 20.0 }, "Td" ],
[ "c", "'" ],
[ { "F": 17.0 }, { "F": 20.0 }, "Td" ],
[ "d", "'" ],
[ { "F": 26.5 }, { "F": 20.0 }, "Td" ],
[ "r", "'" ],
[ { "F": -58.0 }, { "F": 20.0 }, "Td" ],
[ "r", "'" ],
[ { "F": 40.0 }, "TL" ],
[ "goto page in another PDF", "'" ],
[ { "F": 20.0 }, "TL" ],
[
"Also plain text = https://pdfobject.com/pdf/sample-3pp.pdf#page=2",
"'"
],
[ { "F": 25.0 }, "TL" ],
[ "/1", { "F": 11.0 }, "Tf" ],
[ { "F": 120.0 }, { "F": 140.0 }, "Td" ],
[ "Sign and Secure this file.txt as a PDF with Adobe Reader", "'" ],
[ { "F": -5.0 }, { "F": -40.0 }, "Td" ],
[
"or click https://www.adobe.com/acrobat/online/sign-pdf.html",
"'"
],
[ "ET" ],
[
{ "F": 220.0 },
{ "F": 680.0 },
{ "F": 150.0 },
{ "F": 50.0 },
"re"
],
[ "S" ]
]
]
因此文本更容易编辑(需谨慎操作),编辑后可以通过以下命令转换回 PDF:
JSON (
cpdf -j in.json -o out.pdf
) 或 TXT (cpdf pdf.txt -o text2.pdf
)