分享
 
 
 

如何使用Lucene对html文件进行索引

王朝html/css/js·作者佚名  2008-05-19
窄屏简体版  字體: |||超大  

我修改了lucene的demo包的IndexHTML类,使其可以被其他Java类调用。

IndexHTML类

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.Term;

import org.apache.lucene.index.TermEnum;

import java.io.File;import java.util.Date;

import java.util.Arrays;

//还需调用demo的其他类。

import org.apache.lucene.demo;

/**
 * Builds, or incrementally refreshes, a Lucene index over the HTML and text
 * files found beneath a document root directory.
 *
 * <p>Adapted from the Lucene demo's IndexHTML so that it can be driven from
 * other Java classes: configure the document root and the index location via
 * the setters, then call {@link #run(boolean)}. The class relies on the demo
 * package's <code>HTMLDocument</code> helper (<code>org.apache.lucene.demo</code>)
 * to derive document uids and to build the documents that are indexed.
 *
 * <p>Per-run state (reader, writer, uid iterator, deletion flag) lives in
 * unsynchronized instance fields, so one instance should not be used from
 * several threads at the same time.
 *
 * @author tyrone
 */
public class IndexHTML {

    /** Name of the index field that stores each document's unique id. */
    private static final String UID_FIELD = "uid";

    /** Index location used when no index path has been configured. */
    private static final String DEFAULT_INDEX_PATH = "index";

    /** Value assigned to the writer's <code>maxFieldLength</code>. */
    private static final int MAX_FIELD_LENGTH = 1000000;

    /** Root directory (or single file) whose documents are indexed. */
    private String docsPath = null;

    /** Location of the index; {@link #DEFAULT_INDEX_PATH} is used when null. */
    private String indexFilePath = null;

    /** True while the deletion pass of an incremental refresh is running. */
    private boolean deleting = false;

    /** Existing index, open only while an incremental pass is running. */
    private IndexReader reader;

    /** Index being written, open only while {@link #run(boolean)} adds documents. */
    private IndexWriter writer;

    /**
     * Iterator over the uid terms of the existing index; null whenever no
     * incremental pass is in progress, which makes {@link #indexDocs(File)}
     * add every document unconditionally.
     */
    private TermEnum uidIter;

    /**
     * Walks <code>file</code> recursively, visiting directory entries in sorted
     * name order, and processes every file whose path ends in
     * <code>.html</code>, <code>.htm</code> or <code>.txt</code>.
     *
     * <p>When a uid iterator is active the walk is kept in step with it, and
     * each comparison between the iterator's current uid and the file's uid
     * means one of:
     * (a) an old document to be deleted (only acted on during the deletion pass);
     * (b) an unchanged document, to be left alone;
     * or (c) a new document, to be indexed (only acted on outside the deletion pass).
     * Without a uid iterator every matching file is added.
     *
     * @param file directory or file to process
     * @throws Exception if reading the file or updating the index fails
     */
    private void indexDocs(File file) throws Exception {
        if (file.isDirectory()) {
            String[] files = file.list();
            if (files == null) {
                // File.list() yields null when the directory cannot be read; the
                // original code then failed in Arrays.sort and aborted the run.
                // NOTE: during a deletion pass, documents previously indexed from
                // beneath a skipped directory are treated like any other uid with
                // no matching file, i.e. they are removed from the index.
                System.out.println("skipping unreadable directory " + file.getPath());
                return;
            }
            // Sorted traversal; the sync with the uid iterator below depends on
            // files being visited in a fixed order.
            Arrays.sort(files);
            for (int i = 0; i < files.length; i++) {
                indexDocs(new File(file, files[i]));
            }
            return;
        }

        if (!isIndexable(file)) {
            return;
        }

        if (uidIter == null) {
            // Creating a new index: add documents unconditionally.
            addDocument(file);
            return;
        }

        String uid = HTMLDocument.uid(file);

        // Every indexed uid that sorts before this file's uid has no matching
        // file in the walk, so it is stale.
        while (atUidTerm() && uidIter.term().text().compareTo(uid) < 0) {
            if (deleting) {
                deleteCurrentUid();
            }
            uidIter.next();
        }

        if (atUidTerm() && uidIter.term().text().compareTo(uid) == 0) {
            // Already indexed under the same uid: keep it.
            uidIter.next();
        } else if (!deleting) {
            addDocument(file);
        }
    }

    /**
     * Tells whether a file is of a type this indexer handles.
     *
     * @param file candidate file
     * @return true if the path ends in <code>.html</code>, <code>.htm</code> or
     *         <code>.txt</code> (case-sensitive, as in the original check)
     */
    private static boolean isIndexable(File file) {
        String path = file.getPath();
        return path.endsWith(".html") || path.endsWith(".htm") || path.endsWith(".txt");
    }

    /**
     * Tells whether the uid iterator currently points at a term of the uid field.
     *
     * <p>Uses <code>equals</code> rather than reference comparison so the check
     * does not depend on field-name strings being interned.
     *
     * @return true if a current term exists and belongs to the uid field
     */
    private boolean atUidTerm() {
        Term current = uidIter.term();
        return current != null && UID_FIELD.equals(current.field());
    }

    /**
     * Deletes the documents matching the uid iterator's current term from the
     * existing index and reports the deletion on standard output.
     *
     * @throws Exception if the index cannot be modified
     */
    private void deleteCurrentUid() throws Exception {
        System.out.println("deleting " +
                HTMLDocument.uid2url(uidIter.term().text()));
        reader.delete(uidIter.term());
    }

    /**
     * Builds a document for <code>file</code>, reports it on standard output and
     * adds it to the index being written.
     *
     * @param file file to index
     * @throws Exception if the document cannot be built or written
     */
    private void addDocument(File file) throws Exception {
        Document doc = HTMLDocument.Document(file);
        System.out.println("adding " + doc.get("url"));
        writer.addDocument(doc);
    }

    /**
     * Runs one pass over the document tree.
     *
     * <p>With <code>create</code> set, the tree is simply walked and every
     * document added. Otherwise the existing index is opened, a uid iterator is
     * started on it, and the tree is walked in step with that iterator; in a
     * deletion pass any uids left over after the walk are deleted as well. The
     * reader, the iterator and the deletion flag are always reset afterwards,
     * even on failure, so that a later run on the same instance does not see
     * stale, already-closed state.
     *
     * @param file   root of the document tree
     * @param index  location of the existing index
     * @param create true when a brand-new index is being created
     * @throws Exception if the index cannot be read or updated
     */
    private void indexDocs(File file, String index, boolean create)
            throws Exception {
        if (create) {
            // No existing index to compare against.
            this.indexDocs(file);
            return;
        }

        reader = IndexReader.open(index);
        try {
            uidIter = reader.terms(new Term(UID_FIELD, ""));
            this.indexDocs(file);

            if (deleting) {
                // Whatever uids remain have no file in the tree any more.
                while (atUidTerm()) {
                    deleteCurrentUid();
                    uidIter.next();
                }
            }
        } finally {
            deleting = false;
            try {
                if (uidIter != null) {
                    uidIter.close();
                }
            } finally {
                uidIter = null;
                IndexReader openReader = reader;
                reader = null;
                openReader.close();
            }
        }
    }

    /**
     * Creates a new index or refreshes the existing one, then optimizes it.
     *
     * <p>A refresh first runs a deletion pass that removes stale documents and
     * then an addition pass that indexes new ones. Progress, the elapsed time
     * and any failure are reported on standard output; exceptions are not
     * propagated to the caller. If no document root has been set, a message is
     * printed and nothing else happens.
     *
     * @param create if true, create a new index; otherwise refresh the old one
     */
    public void run(boolean create) {
        try {
            if (this.docsPath == null) {
                System.out.println("root directory is not set");
                return;
            }

            String index = (this.indexFilePath != null)
                    ? this.indexFilePath
                    : DEFAULT_INDEX_PATH;
            File root = new File(this.docsPath);
            long start = System.currentTimeMillis();

            if (!create) {
                // Maintenance mode: delete stale docs before adding new ones.
                this.deleting = true;
                this.indexDocs(root, index, create);
            }

            writer = new IndexWriter(index, new StandardAnalyzer(), create);
            try {
                writer.maxFieldLength = MAX_FIELD_LENGTH;
                this.indexDocs(root, index, create); // add new docs

                System.out.println("Optimizing index...");
                writer.optimize();
            } finally {
                // Always release the writer, even if indexing failed part-way.
                IndexWriter openWriter = writer;
                writer = null;
                openWriter.close();
            }

            long end = System.currentTimeMillis();
            System.out.print(end - start);
            System.out.println(" total milliseconds");
        } catch (Exception e) {
            System.out.println(" caught a " + e.getClass() +
                    "\n with message: " + e.getMessage());
        }
    }

    /**
     * @return the configured index location, or null if none has been set
     */
    public String getIndexFilePath() {
        return indexFilePath;
    }

    /**
     * @param property1 the index location to use
     */
    public void setIndexFilePath(String property1) {
        this.indexFilePath = property1;
    }

    /**
     * @return the configured document root, or null if none has been set
     */
    public String getDocsPath() {
        return docsPath;
    }

    /**
     * @param property1 the document root to index
     */
    public void setDocsPath(String property1) {
        this.docsPath = property1;
    }

    /**
     * Manual test entry point: builds a fresh index from hard-coded paths.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        IndexHTML ih = new IndexHTML();
        ih.setDocsPath("D:\\MyProject\\colimas\\clms-doc2\\html");
        ih.setIndexFilePath("D:\\MyProject\\colimas\\index");
        ih.run(true);
    }
}

运行后会在索引目录下生成3个文件:_3i8.cfs、deletable、segments。

搜索文件类:

/*

* Created on 2005/07/28

*

* TODO To change the template for this generated file go to

* Window - Preferences - Java - Code Style - Code Templates

*/package com.nova.colimas.search.query;

/** * @author tyrone * * TODO To change the template for this generated type comment go to

* Window - Preferences - Java - Code Style - Code Templates

*/public class HitsHTMLDoc {

private String Title;

priva

 
 
 
免责声明:本文为网络用户发布,其观点仅代表作者个人观点,与本站无关,本站仅提供信息存储服务。文中陈述内容未经本站证实,其真实性、完整性、及时性本站不作任何保证或承诺,请读者仅作参考,并请自行核实相关内容。
2023年上半年GDP全球前十五强
 百态   2023-10-24
美众议院议长启动对拜登的弹劾调查
 百态   2023-09-13
上海、济南、武汉等多地出现不明坠落物
 探索   2023-09-06
印度或要将国名改为“巴拉特”
 百态   2023-09-06
男子为女友送行,买票不登机被捕
 百态   2023-08-20
手机地震预警功能怎么开?
 干货   2023-08-06
女子4年卖2套房花700多万做美容:不但没变美脸,面部还出现变形
 百态   2023-08-04
住户一楼被水淹 还冲来8头猪
 百态   2023-07-31
女子体内爬出大量瓜子状活虫
 百态   2023-07-25
地球连续35年收到神秘规律性信号,网友:不要回答!
 探索   2023-07-21
全球镓价格本周大涨27%
 探索   2023-07-09
钱都流向了那些不缺钱的人,苦都留给了能吃苦的人
 探索   2023-07-02
倩女手游刀客魅者强控制(强混乱强眩晕强睡眠)和对应控制抗性的关系
 百态   2020-08-20
美国5月9日最新疫情:美国确诊人数突破131万
 百态   2020-05-09
荷兰政府宣布将集体辞职
 干货   2020-04-30
倩女幽魂手游师徒任务情义春秋猜成语答案逍遥观:鹏程万里
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案神机营:射石饮羽
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案昆仑山:拔刀相助
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案天工阁:鬼斧神工
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案丝路古道:单枪匹马
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案镇郊荒野:与虎谋皮
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案镇郊荒野:李代桃僵
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案镇郊荒野:指鹿为马
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案金陵:小鸟依人
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案金陵:千金买邻
 干货   2019-11-12
 
推荐阅读
 
 
 
>>返回首頁<<
 
靜靜地坐在廢墟上,四周的荒凉一望無際,忽然覺得,淒涼也很美
© 2005- 王朝網路 版權所有