在 React.js 中解析 PDF 中的文本

问题描述 投票:0回答:1

我正在使用 pdf.js 库从 PDF 文件中提取文本，但提取出的文本格式不正确，有些行会被提前截断。这些 PDF 文件通常是简历；由于不同简历的布局和文字结构各不相同，我该如何把解析出的文本划分为不同的部分，例如简介、教育经历和工作经验？

这是我将 PDF 解析为文本的代码：

import React, { useState, useRef } from "react";
import * as pdfjs from "pdfjs-dist";
import { WorkerMessageHandler } from "pdfjs-dist/build/pdf.worker.min.mjs";



function PDFParser() {
  const [extractedText, setExtractedText] = useState("");
  const [pdfSrc, setPdfSrc] = useState(null);
  const [selectedFileName, setSelectedFileName] = useState("");
  const fileInputRef = useRef(null);

  const handleFileChange = async (event) => {
    const selectedFile = event.target.files[0];
  
    if (!selectedFile) {
      return;
    }
  
    const fileReader = new FileReader();
    fileReader.onload = async () => {
      const arrayBuffer = fileReader.result;
  
      try {
        pdfjs.GlobalWorkerOptions.workerSrc = "pdf.worker.min.mjs";
        const pdf = await pdfjs.getDocument({ data: arrayBuffer }).promise;
  
        const numPages = pdf.numPages;
        let extractedText = "";
  
        for (let i = 1; i <= numPages; i++) {
          const page = await pdf.getPage(i);
          const pageText = await page.getTextContent();
          
          // Map over text items and join them with a newline character
          const pageLines = pageText.items.map((item) => item.str).join("\n");
  
          // Append the lines from this page to the extracted text
          if (extractedText !== "") {
            extractedText += "\n";
          }
          extractedText += pageLines;
        }
  
        setExtractedText(extractedText);
        setPdfSrc(URL.createObjectURL(selectedFile));
        setSelectedFileName(selectedFile.name);
      } catch (error) {
        console.error("Error parsing PDF:", error);
        
      }
    };
  
    setExtractedText("");
    fileReader.readAsArrayBuffer(selectedFile);
  };
    return (
       <div>
        <input
          type="file"
          onChange={handleFileChange}
          accept=".pdf"
          ref={fileInputRef}
          style={{ display: "none" }}
        />
        <button className="UploadButton" onClick={openFileDialog}>
          Upload PDF
        </button>
      <div className="ScrollableContainer">
            {extractedText && (
             
              <HTMLContent text={extractedText}/>
              
            )}
          </div>
          </div>

 );
}

我尝试过将其转换为 HTML，但 pdfjs-dist 无法将其正确地转换为 HTML。

有人能推荐其他解析文本的方法，或者能帮我实现这一点的库吗？

html reactjs parsing frontend pdfjs-dist
1个回答
0
投票
import React, { useState, useRef } from "react";
import * as pdfjs from "pdfjs-dist";
import { WorkerMessageHandler } from "pdfjs-dist/build/pdf.worker.min.mjs";

function PDFParser() {
  const [extractedText, setExtractedText] = useState("");
  const [pdfSrc, setPdfSrc] = useState(null);
  const [selectedFileName, setSelectedFileName] = useState("");
  const fileInputRef = useRef(null);

  const handleFileChange = async (event) => {
    const selectedFile = event.target.files[0];
  
    if (!selectedFile) {
      return;
    }
  
    const fileReader = new FileReader();
    fileReader.onload = async () => {
      const arrayBuffer = fileReader.result;
  
      try {
        pdfjs.GlobalWorkerOptions.workerSrc = "pdf.worker.min.mjs";
        const pdf = await pdfjs.getDocument({ data: arrayBuffer }).promise;
  
        const numPages = pdf.numPages;
        let extractedText = "";
  
        for (let i = 1; i <= numPages; i++) {
          const page = await pdf.getPage(i);
          const pageText = await page.getTextContent();
          
          // Map over text items and join them with a newline character
          const pageLines = pageText.items.map((item) => item.str).join("\n");
  
          // Append the lines from this page to the extracted text
          if (extractedText !== "") {
            extractedText += "\n";
          }
          extractedText += pageLines;
        }

        // Segment the extracted text into sections
        const sections = segmentText(extractedText);

        // Update state with segmented text
        setExtractedText(sections);
        setPdfSrc(URL.createObjectURL(selectedFile));
        setSelectedFileName(selectedFile.name);
      } catch (error) {
        console.error("Error parsing PDF:", error);
      }
    };
  
    setExtractedText("");
    fileReader.readAsArrayBuffer(selectedFile);
  };

  // Function to segment text into sections
  const segmentText = (text) => {
    // Split text into lines
    const lines = text.split("\n");

    // Define section keywords
    const sectionKeywords = ["education", "experience", "skills", "summary"];

    // Initialize sections object
    const sections = {};

    // Initialize current section
    let currentSection = "";

    // Iterate over lines to identify section boundaries
    lines.forEach((line) => {
      const lowerCaseLine = line.toLowerCase();

      // Check if line contains a section keyword
      const matchedKeyword = sectionKeywords.find(keyword => lowerCaseLine.includes(keyword));
      if (matchedKeyword) {
        currentSection = matchedKeyword;
        if (!sections[currentSection]) {
          sections[currentSection] = [];
        }
      } else {
        // Add line to current section
        if (currentSection !== "") {
          sections[currentSection].push(line);
        }
      }
    });

    return sections;
  };

  const openFileDialog = () => {
    if (fileInputRef.current) {
      fileInputRef.current.click();
    }
  };

  return (
    <div>
      <input
        type="file"
        onChange={handleFileChange}
        accept=".pdf"
        ref={fileInputRef}
        style={{ display: "none" }}
      />
      <button className="UploadButton" onClick={openFileDialog}>
        Upload PDF
      </button>
      <div className="ScrollableContainer">
        {Object.keys(extractedText).map((section, index) => (
          <div key={index}>
            <h2>{section.toUpperCase()}</h2>
            <ul>
              {extractedText[section].map((item, idx) => (
                <li key={idx}>{item}</li>
              ))}
            </ul>
          </div>
        ))}
      </div>
    </div>
  );
}

export default PDFParser;
© www.soinside.com 2019 - 2024. All rights reserved.