SarPig-SanPig: Java Code Index Search Engine Apache Lucene

IndexFiles

import java.io.*;

import java.util.Date;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.document.NumericField;

import org.apache.lucene.index.FieldInfo.IndexOptions;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.IndexWriterConfig.OpenMode;

import org.apache.lucene.index.Term;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

/** Index all text files under a directory.

* <p>

* This is a command-line application demonstrating simple Lucene indexing.

* Run it with no command-line arguments for usage information.

public class IndexFiles {

private IndexFiles() {}

/** Index all text files under a directory. */

public static void main(String[] args) {

String usage = "java org.apache.lucene.demo.IndexFiles "

+ " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"

+ "This indexes the documents in DOCS_PATH, creating a Lucene index"

+ "in INDEX_PATH that can be searched with SearchFiles";

String indexPath = "index";

String docsPath = null;

boolean create = true;

for(int i=0;i<args.length;i++) {

if ("-index".equals(args[i])) {

indexPath = args[i+1];

i++;

} else if ("-docs".equals(args[i])) {

docsPath = args[i+1];

i++;

} else if ("-update".equals(args[i])) {

create = false;

}

if (docsPath == null) {

System.err.println("Usage: " + usage);

System.exit(1);

}

final File docDir = new File(docsPath);

if (!docDir.exists() || !docDir.canRead()) {

System.out.println("Document directory '" +docDir.getAbsolutePath()+ "' does not exist or is not readable, please check the path");

System.exit(1);

}

Date start = new Date();

try {

System.out.println("Indexing to directory '" + indexPath + "'...");

Directory dir = FSDirectory.open(new File(indexPath));

Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);

IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer);

if (create) {

// Create a new index in the directory, removing any

// previously indexed documents:

iwc.setOpenMode(OpenMode.CREATE);

} else {

// Add new documents to an existing index:

iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);

}

// Optional: for better indexing performance, if you

// are indexing many documents, increase the RAM

// buffer. But if you do this, increase the max heap

// size to the JVM (eg add -Xmx512m or -Xmx1g):

// iwc.setRAMBufferSizeMB(256.0);

IndexWriter writer = new IndexWriter(dir, iwc);

indexDocs(writer, docDir);

// NOTE: if you want to maximize search performance,

// you can optionally call forceMerge here. This can be

// a terribly costly operation, so generally it's only

// worth it when your index is relatively static (ie

// you're done adding documents to it):

// writer.forceMerge(1);

writer.close();

Date end = new Date();

System.out.println(end.getTime() - start.getTime() + " total milliseconds");

} catch (IOException e) {

System.out.println(" caught a " + e.getClass() +

"\n with message: " + e.getMessage());

}

static void indexDocs(IndexWriter writer, File file)

throws IOException {

// do not try to index files that cannot be read

if (file.canRead()) {

if (file.isDirectory()) {

String[] files = file.list();

// an IO error could occur

if (files != null) {

for (int i = 0; i < files.length; i++) {

indexDocs(writer, new File(file, files[i]));

}

} else {

FileInputStream fis;

try {

fis = new FileInputStream(file);

} catch (FileNotFoundException fnfe) {

// at least on windows, some temporary files raise this exception with an "access denied" message

// checking if the file can be read doesn't help

return;

}

try {

Field pathField = new Field("path", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);

pathField.setIndexOptions(IndexOptions.DOCS_ONLY);

doc.add(pathField);

NumericField modifiedField = new NumericField("modified");

modifiedField.setLongValue(file.lastModified());

doc.add(modifiedField);

doc.add(new Field("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {

System.out.println("adding " + file);

writer.addDocument(doc);

} else {

System.out.println("updating " + file);

writer.updateDocument(new Term("path", file.getPath()), doc);

}

} finally {

fis.close();

}

SearchFiles

import java.io.*;

import java.util.Date;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.queryParser.QueryParser;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

public class SearchFiles {

private SearchFiles() {}

public static void main(String[] args) throws Exception {

String usage =

"Usage:\tjava org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-query string] [-raw] [-paging hitsPerPage]\n\nSee http://lucene.apache.org/java/4_0/demo.html for details.";

if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {

System.out.println(usage);

System.exit(0);

}

String index = "index";

String field = "contents";

String queries = null;

int repeat = 0;

boolean raw = false;

String queryString = null;

int hitsPerPage = 10;

for(int i = 0;i < args.length;i++) {

if ("-index".equals(args[i])) {

index = args[i+1];

i++;

} else if ("-field".equals(args[i])) {

field = args[i+1];

i++;

} else if ("-queries".equals(args[i])) {

queries = args[i+1];

i++;

} else if ("-query".equals(args[i])) {

queryString = args[i+1];

i++;

} else if ("-repeat".equals(args[i])) {

repeat = Integer.parseInt(args[i+1]);

i++;

} else if ("-raw".equals(args[i])) {

raw = true;

} else if ("-paging".equals(args[i])) {

hitsPerPage = Integer.parseInt(args[i+1]);

if (hitsPerPage <= 0) {

System.err.println("There must be at least 1 hit per page.");

System.exit(1);

}

i++;

}

IndexReader reader = IndexReader.open(FSDirectory.open(new File(index)));

IndexSearcher searcher = new IndexSearcher(reader);

Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);

BufferedReader in = null;

if (queries != null) {

in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8"));

} else {

in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));

}

QueryParser parser = new QueryParser(Version.LUCENE_31, field, analyzer);

while (true) {

if (queries == null && queryString == null) { // prompt the user

System.out.println("Enter query: ");

}

String line = queryString != null ? queryString : in.readLine();

if (line == null || line.length() == -1) {

break;

}

line = line.trim();

if (line.length() == 0) {

break;

}

Query query = parser.parse(line);

System.out.println("Searching for: " + query.toString(field));

if (repeat > 0) { // repeat & time as benchmark

Date start = new Date();

for (int i = 0; i < repeat; i++) {

searcher.search(query, null, 100);

}

Date end = new Date();

System.out.println("Time: "+(end.getTime()-start.getTime())+"ms");

}

doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null);

if (queryString != null) {

break;

}

searcher.close();

reader.close();

}

public static void doPagingSearch(BufferedReader in, IndexSearcher searcher, Query query,

int hitsPerPage, boolean raw, boolean interactive) throws IOException {

// Collect enough docs to show 5 pages

TopDocs results = searcher.search(query, 5 * hitsPerPage);

ScoreDoc[] hits = results.scoreDocs;

int numTotalHits = results.totalHits;

System.out.println(numTotalHits + " total matching documents");

int start = 0;

int end = Math.min(numTotalHits, hitsPerPage);

while (true) {

if (end > hits.length) {

System.out.println("Only results 1 - " + hits.length +" of " + numTotalHits + " total matching documents collected.");

System.out.println("Collect more (y/n) ?");

String line = in.readLine();

if (line.length() == 0 || line.charAt(0) == 'n') {

break;

}

hits = searcher.search(query, numTotalHits).scoreDocs;

}

end = Math.min(hits.length, start + hitsPerPage);

for (int i = start; i < end; i++) {

if (raw) { // output raw format

System.out.println("doc="+hits[i].doc+" score="+hits[i].score);

continue;

}

Document doc = searcher.doc(hits[i].doc);

String path = doc.get("path");

if (path != null) {

System.out.println((i+1) + ". " + path);

String title = doc.get("title");

if (title != null) {

System.out.println(" Title: " + doc.get("title"));

}

} else {

System.out.println((i+1) + ". " + "No path for this document");

}

if (!interactive || end == 0) {

break;

}

if (numTotalHits >= end) {

boolean quit = false;

while (true) {

System.out.print("Press ");

if (start - hitsPerPage >= 0) {

System.out.print("(p)revious page, ");

}

if (start + hitsPerPage < numTotalHits) {

System.out.print("(n)ext page, ");

}

System.out.println("(q)uit or enter number to jump to a page.");

String line = in.readLine();

if (line.length() == 0 || line.charAt(0)=='q') {

quit = true;

break;

}

if (line.charAt(0) == 'p') {

start = Math.max(0, start - hitsPerPage);

break;

} else if (line.charAt(0) == 'n') {

if (start + hitsPerPage < numTotalHits) {

start+=hitsPerPage;

}

break;

} else {

int page = Integer.parseInt(line);

if ((page - 1) * hitsPerPage < numTotalHits) {

start = (page - 1) * hitsPerPage;

break;

} else {

System.out.println("No such page");

}

if (quit) break;

end = Math.min(numTotalHits, start + hitsPerPage);

}

1. Index digunakan untuk menyimpan hasil pengurutan dokumen yang berada pada Koleksi, dan folder index di-update dengan data terbaru pada Koleksi.

2. Index dibuka pada SearchFiles dengan IndexReader, sedangkan pencarian pada index dilakukan dengan IndexSearcher.

IndexReader reader = IndexReader.open(FSDirectory.open(new File(index)));

IndexSearcher searcher = new IndexSearcher(reader);

3. Jika query masih kosong, program mencetak “Enter query: ”, lalu pengguna memasukkan kata yang dicari dan pencarian dilakukan dengan IndexSearcher.

SarPig-SanPig

Pemilik

Senin, 18 Juni 2012

Java Code Index Search Engine Apache Lucene

Statistik Pengunjung

Blog Archive