我正在用 C# 开发一个文档审阅应用程序,用于在文档中搜索一组搜索词。它需要能够读取 PDF 或 WORD 文档,将来可能还需要支持其他类型的文档。
我有一个枚举
DocumentType
:
/// <summary>
/// Kinds of document the review application can be asked to open.
/// </summary>
public enum DocumentType
{
    /// <summary>No document type has been chosen.</summary>
    NotSpecified = 0,

    /// <summary>A PDF document.</summary>
    PDF = 1,

    /// <summary>A Word document.</summary>
    WORD = 2,
}
和基类
DocumentInterface<DocumentType>
:
/// <summary>
/// Base class for page-oriented document readers. It tracks the document path,
/// the page count and the current page, and delegates opening, closing and
/// page extraction to format-specific derived classes.
/// </summary>
/// <typeparam name="DocumentType">
/// Type of the underlying document object supplied by the derived class.
/// NOTE: inside this class the type parameter hides any other type that is
/// also called <c>DocumentType</c>.
/// </typeparam>
public abstract class DocumentInterface<DocumentType>
{
    /// <summary>Path to the document being reviewed.</summary>
    public string DocumentPath { get; set; }

    /// <summary>
    /// The opened document object. Storage is owned by the derived class so it
    /// can expose the concrete document type.
    /// </summary>
    public abstract DocumentType Document { get; set; }

    /// <summary>Count of pages in the review document.</summary>
    public int PageCount { get; set; }

    /// <summary>
    /// Number of the page most recently returned by
    /// <see cref="GetNextPageContents"/>; 0 before the first page is read.
    /// </summary>
    public int CurrentPage { get; set; }

    /// <summary>Reports whether there are any pages after the current position.</summary>
    public bool HasMorePages
    {
        get { return CurrentPage < PageCount; }
    }

    /// <summary>
    /// Advances to the next page and returns its contents.
    /// </summary>
    /// <returns>
    /// The text returned by <see cref="GetPageContents"/> for the next page, or
    /// <see cref="string.Empty"/> when there are no more pages.
    /// </returns>
    public string GetNextPageContents()
    {
        if (!HasMorePages)
        {
            return string.Empty;
        }

        // The page number is incremented before the call, so the first page
        // requested from GetPageContents is page 1.
        CurrentPage++;
        return GetPageContents(CurrentPage);
    }

    #region Constructor & Destructor
    /// <summary>
    /// Opens the document at <paramref name="documentpath"/> and, when that
    /// succeeds, records its page count.
    /// </summary>
    /// <param name="documentpath">Path of the document to open.</param>
    public DocumentInterface(string documentpath)
    {
        // Remember the path so DocumentPath reflects what was opened.
        DocumentPath = documentpath;

        // Store through the abstract property: the derived class owns the
        // storage, and its GetPageCount/GetPageContents/CloseDocument overrides
        // read the document back through that same property. Keeping the
        // result in a private base field would leave Document unset for them.
        // NOTE(review): these are virtual calls made from a constructor, so
        // overrides must not rely on derived-constructor initialisation.
        Document = OpenDocument(documentpath);

        // Make sure we opened the document successfully
        if (Document != null)
        {
            PageCount = GetPageCount();
            CurrentPage = 0;
        }
    }

    /// <summary>
    /// Finalizer: closes the document if one was opened.
    /// </summary>
    ~DocumentInterface()
    {
        // Only close when a document was actually opened; an exception that
        // escapes a finalizer terminates the process.
        if (Document != null)
        {
            CloseDocument();
        }
    }
    #endregion

    #region Abstract Methods
    /// <summary>Opens the document at <paramref name="path"/>.</summary>
    /// <param name="path">Path of the document to open.</param>
    /// <returns>The opened document object; the constructor treats null as "not opened".</returns>
    public abstract DocumentType OpenDocument(string path);

    /// <summary>Returns the number of pages in the opened document.</summary>
    public abstract int GetPageCount();

    /// <summary>Returns the text contents of the given page.</summary>
    /// <param name="pageNumber">Page number; <see cref="GetNextPageContents"/> starts at 1.</param>
    public abstract string GetPageContents(int pageNumber);

    /// <summary>Closes the opened document.</summary>
    public abstract void CloseDocument();
    #endregion
}
和派生类
DocumentInterfacePDF
:
/// <summary>
/// PDF implementation of <c>DocumentInterface&lt;PdfDocument&gt;</c> that uses
/// iText7 to open a PDF file and extract text page by page.
/// </summary>
public class DocumentInterfacePDF : DocumentInterface<PdfDocument>
{
    // Default scanning extents for the PDF reader.
    // Numbers are points, which are 72 to the inch.
    private const int A4Width = 595;     // 210mm
    private const int A4Height = 842;    // 297mm
    private const int HeaderHeight = 57; // 20mm
    private const int FooterHeight = 57; // 20mm

    // Implementation of the abstract base generic as a PdfDocument
    private PdfDocument _document;

    /// <summary>The opened iText7 document, or null when none was opened.</summary>
    public override PdfDocument Document
    {
        get => _document;
        set => _document = value;
    }

    /// <summary>
    /// Opens the PDF at <paramref name="documentpath"/>.
    /// </summary>
    /// <param name="documentpath">Path of the file to open.</param>
    /// <returns>
    /// The opened document, or null when the path is null, empty, or does not
    /// end in ".pdf" (compared case-insensitively).
    /// </returns>
    public override PdfDocument OpenDocument(string documentpath)
    {
        // EndsWith handles paths shorter than the extension (Substring would
        // throw) and the ordinal comparison does not depend on the culture.
        if (string.IsNullOrEmpty(documentpath)
            || !documentpath.EndsWith(".pdf", System.StringComparison.OrdinalIgnoreCase))
        {
            return null;
        }

        // Open the PDF
        PdfReader reader = new PdfReader(documentpath);

        // return to base class to assign to the document
        return new PdfDocument(reader);
    }

    #region Base Class Overrides to implement as a iText7 PDF Interface
    /// <summary>
    /// Gets the number of pages in the PDF document
    /// </summary>
    /// <returns>The page count reported by iText7.</returns>
    public override int GetPageCount()
    {
        return Document.GetNumberOfPages();
    }

    /// <summary>
    /// Gets the page contents for a specific page number, restricted to the
    /// A4 body area between the header and footer bands.
    /// </summary>
    /// <param name="pageNumber">Number of the page to read.</param>
    /// <returns>The text extracted from the filtered region of the page.</returns>
    public override string GetPageContents(int pageNumber)
    {
        // iText7's Rectangle takes (x, y, width, height) with the origin at
        // the bottom-left corner, so the region starts above the footer and
        // its height excludes both the header and the footer band.
        var bodyArea = new Rectangle(
            0,
            FooterHeight,
            A4Width,
            A4Height - HeaderHeight - FooterHeight);

        PdfPage page = Document.GetPage(pageNumber);

        // Read the page contents, keeping only text inside the body area
        FilteredTextEventListener listener =
            new FilteredTextEventListener(
                new LocationTextExtractionStrategy(),
                new TextRegionEventFilter(bodyArea));

        // Return the page contents
        return PdfTextExtractor.GetTextFromPage(page, listener);
    }

    /// <summary>
    /// Closes the PDF Document, if one was opened.
    /// </summary>
    public override void CloseDocument()
    {
        // Document is null when OpenDocument rejected the path.
        Document?.Close();
    }
    #endregion

    #region Constructor
    /// <summary>
    /// Constructor
    /// Call the base class constructor to setup everything in the predefined way.
    /// </summary>
    /// <param name="documentpath">Path of the PDF file to open.</param>
    public DocumentInterfacePDF(string documentpath) : base(documentpath)
    {
        // all of the implementation is taken care of in the Base Class
    }
    #endregion
}
在我的
DocumentParser
类中,我想使用基类实例化文档,以便可以在运行时根据用户选择的文档类型决定派生类型。
internal class ReviewDocumentParser
{
#region Properties
// Backing fields for the properties below.
private List<SearchTerm> _searchterms;
private SearchResults _results;
private string _documentPath;
private string _searchTermsPath;
private DocumentType _documentType;
private DocumentInterface<DocumentType> _documentInterface;

/// <summary>Search terms held by this parser.</summary>
public List<SearchTerm> SearchTerms
{
    get => _searchterms;
    set => _searchterms = value;
}

/// <summary>Search results object held by this parser.</summary>
public SearchResults Results
{
    get => _results;
    set => _results = value;
}

/// <summary>Path of the document under review.</summary>
public string DocumentPath
{
    get => _documentPath;
    set => _documentPath = value;
}

/// <summary>Path of the search terms source.</summary>
public string SearchTermsPath
{
    get => _searchTermsPath;
    set => _searchTermsPath = value;
}

/// <summary>Kind of document this parser was asked to handle.</summary>
public DocumentType DocumentType
{
    get => _documentType;
    set => _documentType = value;
}

/// <summary>Document reader used by this parser.</summary>
public DocumentInterface<DocumentType> DocumentInterface
{
    get => _documentInterface;
    set => _documentInterface = value;
}
#endregion
//... unnecessary code ommitted
#region Constructor
/// <summary>
/// Creates a parser for one document: stores the arguments, creates the
/// search results from the search terms path, and picks the document reader
/// that matches <paramref name="documenttype"/>.
/// </summary>
/// <param name="documenttype">Kind of document to open.</param>
/// <param name="documentpath">Path of the document to review.</param>
/// <param name="searchtermspath">Path of the search terms source.</param>
/// <exception cref="NoDocumentParserFoundException">
/// Thrown when <paramref name="documenttype"/> is
/// <see cref="DocumentType.NotSpecified"/> or any value without a reader.
/// </exception>
public ReviewDocumentParser(DocumentType documenttype, string documentpath, string searchtermspath)
{
    _documentType = documenttype;
    _documentPath = documentpath;
    _searchTermsPath = searchtermspath;

    // Hook the Search Terms element up
    _results = new SearchResults(_searchTermsPath);

    // NOTE(review): each assignment below requires the derived reader type to
    // be convertible to the declared type of _documentInterface.
    switch (documenttype)
    {
        case DocumentType.PDF:
            _documentInterface = new DocumentInterfacePDF(_documentPath);
            break;
        case DocumentType.WORD:
            _documentInterface = new DocumentInterfaceWORD(_documentPath);
            break;
        case DocumentType.NotSpecified:
        default:
            // Also covers enum values with no matching case, which would
            // otherwise leave _documentInterface null without any error.
            throw new NoDocumentParserFoundException("No suitable document parser found.");
    }
}
#endregion
我一直在尝试解决下面的错误,但没有成功:我总是在兜圈子,解决了一个问题又引出另一个问题。我开始怀疑自己是不是在尝试做一件不可能的事情。我已经卡在这里 24 小时了。
错误 CS0029 无法将类型“PDF_Reader_Test.Model.DocumentInterfacePDF”隐式转换为“PDF_Reader_Test.Model.DocumentInterface<PDF_Reader_Test.Model.DocumentType>”
我尝试将类型描述符添加到实例化语句中。 PdfDocument 是 iText7 包中定义的类型,我用它仅从 PDF 文档中提取文本。
// Attempted fix: supply a type argument where the PDF reader is instantiated.
// NOTE(review): DocumentInterfacePDF is declared as a non-generic class in this
// file, so giving it a type argument is not expected to compile.
case DocumentType.PDF:
_documentInterface = new DocumentInterfacePDF<PdfDocument>(_documentPath);
break;
但这只会产生不同的警告。我也尝试过修改基类和派生类定义,但我只是遇到了更大的问题,我认为它们(几乎)是正确的。
我期望基类能够分配一个派生类类型。我想在基类中定义广泛的行为,以便它保持一致,并且只覆盖 PDF 和 Word 的派生类中处理不同文件类型的位。我还没有解决Word版本。
所以,这里有几个问题,有些已经在评论中提到了。
命名约定:不要将你的基类称为“接口”。它不是一个接口,接口通常以“I”为前缀。更好的选择就是简单地
Document
或可能 DocumentBase
。此外,通常的做法是在泛型参数前面加上 T
,因此更好的选择可能是:class DocumentBase<TDocument>
。
资源释放:根据经验,不要依赖终结器(析构函数)。你的类应该实现
IDisposable
。然后,如果您的底层实现(例如 PdfDocument
)需要处置,您应该在 Dispose()
方法中执行此操作。
ReviewDocumentParser:在构造函数内完成所有开销很大的 IO 和文档加载是糟糕的设计。要么不让文档成为解析器状态的一部分,而是提供一个返回文档实现的方法;要么把这段逻辑放进工厂方法中。
直接解决你的问题:你使用
DocumentType
既作为类型名称,又作为泛型参数的名称,这是不幸的,因为现在你不小心声明了 DocumentInterface<DocumentType> _documentInterface
,它的泛型参数实际上是枚举类型。这就是为什么你不能把 DocumentInterfacePDF
(它以 PdfDocument
作为泛型参数)赋值给它。
如您所见,遵守正确的命名约定可以帮助您避免此类错误。
此外,我认为您可能根本不需要通用参数。您可以将其保留为派生类型的实现细节。
这是我如何解决这个问题的简化版本。 (免责声明:我对itext不熟悉,所以这只是关于类设计。)
/// <summary>
/// Format-independent view of a document that can be read page by page.
/// Concrete document formats derive from this class.
/// </summary>
public abstract class Document : IDisposable
{
    /// <summary>
    /// Releases whatever the concrete document holds open.
    /// </summary>
    public abstract void Dispose();

    /// <summary>Number of pages in the document.</summary>
    public abstract int PageCount { get; }

    // [...]

    /// <summary>Returns the text contents of the given page.</summary>
    /// <param name="pageNumber">Number of the page to read.</param>
    public abstract string GetPageContents(int pageNumber);
}
/// <summary>
/// <see cref="Document"/> implementation for PDF files, backed by an iText
/// reader and document.
/// </summary>
public sealed class PdfDocument : Document
{
    // full namespace to avoid name collisions
    private readonly iText.Kernel.Pdf.PdfReader _reader;
    private readonly iText.Kernel.Pdf.PdfDocument _document;
    private bool _disposed;

    /// <summary>Opens the PDF file at <paramref name="path"/>.</summary>
    /// <param name="path">Path of the PDF file to open.</param>
    public PdfDocument(string path)
    {
        _reader = new (path);
        try
        {
            _document = new (_reader);
        }
        catch
        {
            // The caller never receives an instance to dispose when the
            // constructor throws, so release the reader here.
            _reader.Close();
            throw;
        }
    }

    /// <summary>Number of pages reported by the underlying iText document.</summary>
    public override int PageCount => _document.GetNumberOfPages();

    /// <summary>Returns the text contents of the given page.</summary>
    /// <param name="pageNumber">Number of the page to read.</param>
    /// <returns>Currently always null; the extraction logic is elided in this sketch.</returns>
    public override string GetPageContents(int pageNumber)
    {
        // [...]
        return null;
    }

    /// <summary>
    /// Closes the document and then the reader. Calls after the first
    /// successful one do nothing.
    /// </summary>
    public override void Dispose()
    {
        if (_disposed)
        {
            return;
        }

        _document.Close();
        _reader.Close();
        _disposed = true;
    }
}
/// <summary>
/// Creates the <see cref="Document"/> implementation that matches a requested
/// document type.
/// </summary>
internal class ReviewDocumentParser
{
    /// <summary>
    /// Opens the document at <paramref name="documentpath"/> with the
    /// implementation selected by <paramref name="documenttype"/>.
    /// </summary>
    /// <param name="documenttype">Kind of document to open.</param>
    /// <param name="documentpath">Path of the document to open.</param>
    /// <param name="searchtermspath">Path of the search terms source (not used in the visible part of this sketch).</param>
    /// <returns>
    /// The opened document, or null when no case handles
    /// <paramref name="documenttype"/>. The caller is responsible for
    /// disposing the returned document.
    /// </returns>
    public Document Parse(DocumentType documenttype, string documentpath, string searchtermspath)
    {
        // [...]
        switch (documenttype)
        {
            case DocumentType.PDF:
                // Use the method parameter; this class declares no
                // _documentPath field.
                return new PdfDocument(documentpath);
            // [...]
        }

        return null;
    }
}