我正在使用 pdf.js 库从 PDF 文件中提取文本，但提取出的文本格式不正确，有些行在行尾被意外断开。这些 PDF 文件通常是一份简历，由于不同的简历可能有不同的版式和文字结构，我该如何将解析出的文本划分成不同的部分，例如介绍、教育和经验？
这是我将 PDF 解析为文本的代码：
import React, { useState, useRef } from "react";
import * as pdfjs from "pdfjs-dist";
import { WorkerMessageHandler } from "pdfjs-dist/build/pdf.worker.min.mjs";
function PDFParser() {
const [extractedText, setExtractedText] = useState("");
const [pdfSrc, setPdfSrc] = useState(null);
const [selectedFileName, setSelectedFileName] = useState("");
const fileInputRef = useRef(null);
const handleFileChange = async (event) => {
const selectedFile = event.target.files[0];
if (!selectedFile) {
return;
}
const fileReader = new FileReader();
fileReader.onload = async () => {
const arrayBuffer = fileReader.result;
try {
pdfjs.GlobalWorkerOptions.workerSrc = "pdf.worker.min.mjs";
const pdf = await pdfjs.getDocument({ data: arrayBuffer }).promise;
const numPages = pdf.numPages;
let extractedText = "";
for (let i = 1; i <= numPages; i++) {
const page = await pdf.getPage(i);
const pageText = await page.getTextContent();
// Map over text items and join them with a newline character
const pageLines = pageText.items.map((item) => item.str).join("\n");
// Append the lines from this page to the extracted text
if (extractedText !== "") {
extractedText += "\n";
}
extractedText += pageLines;
}
setExtractedText(extractedText);
setPdfSrc(URL.createObjectURL(selectedFile));
setSelectedFileName(selectedFile.name);
} catch (error) {
console.error("Error parsing PDF:", error);
}
};
setExtractedText("");
fileReader.readAsArrayBuffer(selectedFile);
};
return (
<div>
<input
type="file"
onChange={handleFileChange}
accept=".pdf"
ref={fileInputRef}
style={{ display: "none" }}
/>
<button className="UploadButton" onClick={openFileDialog}>
Upload PDF
</button>
<div className="ScrollableContainer">
{extractedText && (
<HTMLContent text={extractedText}/>
)}
</div>
</div>
);
}
我尝试将其转换为 HTML，但 pdfjs-dist 无法将其正确地转换为 HTML。
那么，有人可以建议其他解析文本的方法，或者推荐一些能帮助我做到这一点的库吗？
import React, { useState, useRef } from "react";
import * as pdfjs from "pdfjs-dist";
import { WorkerMessageHandler } from "pdfjs-dist/build/pdf.worker.min.mjs";
function PDFParser() {
const [extractedText, setExtractedText] = useState("");
const [pdfSrc, setPdfSrc] = useState(null);
const [selectedFileName, setSelectedFileName] = useState("");
const fileInputRef = useRef(null);
const handleFileChange = async (event) => {
const selectedFile = event.target.files[0];
if (!selectedFile) {
return;
}
const fileReader = new FileReader();
fileReader.onload = async () => {
const arrayBuffer = fileReader.result;
try {
pdfjs.GlobalWorkerOptions.workerSrc = "pdf.worker.min.mjs";
const pdf = await pdfjs.getDocument({ data: arrayBuffer }).promise;
const numPages = pdf.numPages;
let extractedText = "";
for (let i = 1; i <= numPages; i++) {
const page = await pdf.getPage(i);
const pageText = await page.getTextContent();
// Map over text items and join them with a newline character
const pageLines = pageText.items.map((item) => item.str).join("\n");
// Append the lines from this page to the extracted text
if (extractedText !== "") {
extractedText += "\n";
}
extractedText += pageLines;
}
// Segment the extracted text into sections
const sections = segmentText(extractedText);
// Update state with segmented text
setExtractedText(sections);
setPdfSrc(URL.createObjectURL(selectedFile));
setSelectedFileName(selectedFile.name);
} catch (error) {
console.error("Error parsing PDF:", error);
}
};
setExtractedText("");
fileReader.readAsArrayBuffer(selectedFile);
};
// Function to segment text into sections
const segmentText = (text) => {
// Split text into lines
const lines = text.split("\n");
// Define section keywords
const sectionKeywords = ["education", "experience", "skills", "summary"];
// Initialize sections object
const sections = {};
// Initialize current section
let currentSection = "";
// Iterate over lines to identify section boundaries
lines.forEach((line) => {
const lowerCaseLine = line.toLowerCase();
// Check if line contains a section keyword
const matchedKeyword = sectionKeywords.find(keyword => lowerCaseLine.includes(keyword));
if (matchedKeyword) {
currentSection = matchedKeyword;
if (!sections[currentSection]) {
sections[currentSection] = [];
}
} else {
// Add line to current section
if (currentSection !== "") {
sections[currentSection].push(line);
}
}
});
return sections;
};
const openFileDialog = () => {
if (fileInputRef.current) {
fileInputRef.current.click();
}
};
return (
<div>
<input
type="file"
onChange={handleFileChange}
accept=".pdf"
ref={fileInputRef}
style={{ display: "none" }}
/>
<button className="UploadButton" onClick={openFileDialog}>
Upload PDF
</button>
<div className="ScrollableContainer">
{Object.keys(extractedText).map((section, index) => (
<div key={index}>
<h2>{section.toUpperCase()}</h2>
<ul>
{extractedText[section].map((item, idx) => (
<li key={idx}>{item}</li>
))}
</ul>
</div>
))}
</div>
</div>
);
}
export default PDFParser;