我正在编写一个基本的 Lucene.Net 应用程序来索引本质上是论坛帖子的内容。为了简化,每个
Post
文档都有一个 URL
和一些 Content
。对于每个给定的线程,我将每个 Post
索引为单独的文档(将整个线程索引为单个文档在搜索时会返回太多误报)。
我遇到的问题是如何处理结果集中具有相同
URL
的多个 Post
文档。当我搜索并返回 10 个结果时,我希望每个结果引用不同的 URL
。
目前,我有以下内容:
// setup
// Analyzer and parser are both pinned to the same LUCENE_30 version constant,
// so query-time tokenization matches index-time tokenization.
StandardAnalyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
FSDirectory directory = FSDirectory.Open(indexLocation);
IndexSearcher searcher = new IndexSearcher(directory);
// NOTE(review): `parser` has no declared type in this snippet — presumably a
// QueryParser field declared elsewhere; confirm against the surrounding code.
parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "body", analyzer);
// search
// Returns the top 10 documents by score; documents sharing the same URL are
// NOT collapsed here — that is exactly the problem described in the question.
Query query = parser.Parse(queryString);
TopDocs topDocs = searcher.Search(query, null, 10);
但是,在返回的 10 个结果中,可能只有 7 个是唯一的
URLs
。我考虑过在再次搜索之前丢弃这些重复项,返回更大的结果集并丢弃前 10 个(类似于分页),直到我有 10 个唯一的 URLs
,但这引发了一些问题,例如我应该何时停止,因为没有更多结果?等等
感觉应该有一种在
TopDocs topDocs = searcher.Search()
点进行过滤的方法,返回10个具有唯一URLs
的结果。我找不到任何与此相关的内容(也许我没有使用正确的术语),但我确信很多其他应用程序之前一定已经解决了这个问题...类似的事情是否已经存在,或者任何人都可以提供指针如何实施?
我曾为 Lucene 2.9.x 开发过额外过滤器等功能,现在我发现针对 4.8 版本它需要完全重写。
因此,如果您使用的是 2.9.x 版本,这里有一个解决方案:编写您自己的命中收集器(hit collector),在其 Collect 方法中额外检查文档是否已存在于唯一文档集(位数组)中。该位数组必须预先构造并缓存(来源如下):
public class DistinctValuesFilter
{
    #region ctor
    /// <summary>
    /// Filter for a Lucene.Net 2.9.x hit collector: reports whether a document is
    /// the first occurrence of its <c>DistinctBy</c> field value in this search,
    /// so results can be deduplicated (e.g. one hit per URL).
    /// </summary>
    /// <param name="searchReader">reader over the index being searched</param>
    /// <param name="request">search request carrying the field name to deduplicate on</param>
    /// <param name="docLength">document-count upper bound used to size the processed-bit mask</param>
    /// <param name="anlzr">unused here; kept for signature compatibility with callers</param>
    public DistinctValuesFilter(IndexReader searchReader, ISearchRequest request, int docLength, Analyzer anlzr)
    {
        // Intern so the field name can be compared by reference against Term.Field().
        this.distinctBy = StringHelper.Intern(request.DistinctBy);
        this.processedMask = new OpenBitSetDISI(docLength);
        FindDuplicateTermsDirectly(searchReader);
        iireader = searchReader;
        // FIX: the original declared `ivalue` as an expression-bodied property
        // (`ivalue => new Lazy<...>`), which built a brand-new Lazy<string[]> on
        // EVERY access, so the FieldCache lookup re-ran each time and the lazy
        // caching was defeated. A single Lazy created once here restores the
        // intended "compute at most once" behavior.
        this.ivalue = new Lazy<string[]>(() => FieldCache_Fields.DEFAULT.GetStrings(searchReader, this.distinctBy));
    }

    /// <summary>
    /// Initializes the duplicates hash set and the array of positions where
    /// duplicates are situated. Code partially taken from Lucene:
    /// Lucene.Net.Search.FieldCacheImpl.StringIndexCache
    /// (protected internal override object CreateValue(IndexReader reader, Entry entryKey)).
    /// </summary>
    /// <param name="ireader">index reader to scan for duplicate terms</param>
    private void FindDuplicateTermsDirectly(IndexReader ireader)
    {
        var maxLength = ireader.MaxDoc();
        duplicates = new HashSet<int>(maxLength);
        // duplicatesLocations[doc] holds (term ordinal + 1); 0 means the
        // document has no value for the distinctBy field at all.
        duplicatesLocations = new int[maxLength];
        var termEnum = ireader.Terms(new Term(this.distinctBy));
        var termDocs = ireader.TermDocs();
        int num = 0;
        int k, p;
        int firstDocNo = -1;
        try
        {
            do
            {
                var term = termEnum.Term();
                // Reference comparison is safe: both sides are interned strings.
                if (term == null || term.Field() != this.distinctBy || num >= maxLength)
                    break;
                termDocs.Seek(termEnum);
                p = 0;
                while (termDocs.Next())
                {
                    k = termDocs.Doc();
                    duplicatesLocations[k] = num + 1; // 0 is reserved for "field completely empty"
                    // From the second doc of this term onward, mark the previous
                    // doc as a duplicate; the loop tail below marks the last one.
                    if (p > 0) duplicates.Add(firstDocNo);
                    firstDocNo = k;
                    p++;
                }
                // p > 1 means the term occurred in several docs: the last doc
                // seen (still in firstDocNo) is also part of the duplicate group.
                if (p > 1) duplicates.Add(firstDocNo);
                num++;
            }
            while (termEnum.Next());
        }
        finally
        {
            termDocs.Close();
            termEnum.Close();
        }
    }

    protected IndexReader iireader;
    protected string distinctBy;
    // Docs that share their distinctBy value with at least one other doc.
    private HashSet<int> duplicates;
    // Per-doc term ordinal + 1; 0 = no value for the distinctBy field.
    private int[] duplicatesLocations;
    // Docs already emitted (or suppressed) during this search.
    private OpenBitSetDISI processedMask;
    // Lazily-loaded per-doc string values of the distinctBy field (FieldCache).
    private readonly Lazy<string[]> ivalue;
    #endregion

    /// <summary>
    /// Returns true when <paramref name="docIndex"/> should be kept in the
    /// result set: it has a distinctBy value and no document with the same
    /// value has been accepted yet. All members of the doc's duplicate group
    /// are marked processed so later hits from the group are rejected.
    /// </summary>
    /// <param name="docIndex">Lucene document number being collected</param>
    /// <exception cref="NotSupportedException">
    /// if two docs mapped to the same term ordinal carry different field values
    /// (index/cache inconsistency).
    /// </exception>
    public bool IsDistinct(int docIndex)
    {
        if (this.processedMask.FastGet(docIndex)) return false;
        if (duplicatesLocations[docIndex] == 0) return false; // field value doesn't exist at all
        if (!duplicates.Contains(docIndex)) return true;      // unique value: keep, nothing to mask
        var dval = duplicatesLocations[docIndex];
        var v = ivalue.Value;
        var xv = string.Empty;
        // Mask every doc in the same duplicate group, sanity-checking that
        // they all carry the same field value.
        for (int i = 0; i < duplicatesLocations.Length; i++)
        {
            if (duplicatesLocations[i] == dval)
            {
                this.processedMask.FastSet(i);
                if (!string.IsNullOrEmpty(xv) && xv != v[i])
                {
                    throw new NotSupportedException($"values are not same ({i}): [{xv}] != [{v[i]}]");
                }
                xv = v[i];
            }
        }
        return true;
    }
}