Basics: how to add, delete, and update documents in Lucene


Cấu trúc thư mục như sau :

d1

 

Source code : Download

File IndexingTest.java

package thaihoanghai;

import java.io.IOException;

import junit.framework.TestCase;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.junit.Test;
/**
 * In this example say basics on how to add,delete, and update documents.
 * 
 * NOTE Users often confuse the maxDoc()and numDocs() methods in IndexWriter and IndexReader. 
 * The first method, maxDoc()returns the total number of deleted or undeleted 
 * documents in the index, whereas numDocs()returns only the number of undeleted documents.
 *
 */
public class IndexingTest extends TestCase{
	// Data used to test for this program
	//Example we has 1 table include fields
	//|-------------------------------------------------------------|
	//| ProductID	|	Name	|	ExpiryDate	|	Description		|
	//|-------------------------------------------------------------|
	//| ID001		|	Iphone4	|	20/10/2012	|	..ChipA6...v.v	|
	//| ID002		|	Iphone5	|	09/09/2013	|	..ChipA7...v.v	|
	//|-------------------------------------------------------------|
	protected String[] idProducts = {"id001", "id002", "id003"};
	protected String[] names = {"Iphone4", "Iphone5", "Iphone6"};
	protected String[] expiryDates = {"20/10/2012", "08/09/2013", "08/09/2013"};
	protected String[] descriptions = {"1023x896 HD, Camera 8PM, 64GB, Chip A6, 134gram",
									   "1024x887 HD, Camera 12PM, 32GB 64bit, Chip A7, 120gram",
									   "1024x887 HD, Camera 12PM, 32GB 64bit, Chip A8, 120gram"};

	private Directory directory;

	/**
	 * The setUp method create new RAMDirectory to hold the index
	 * It creates an indexWriter on this Directory and iterates over our item
	 * to create Document and Fields and add the Document to IndexWriter => [indexing]
	 * 
	 * Note : We could also have called commit(),which would commit the changes to the 
	 * directory but leave the writer open for further changes.
	 */
	protected void setUp() throws Exception { // Method run before every test
		directory = new RAMDirectory();
		// Create IndexWriter
		IndexWriter writer = getWriter();

		// Add Document
		for(int i = 0; i < idProducts.length; i++){
			Document doc = new Document();
			doc.add(new Field("ProductID",idProducts[i],Field.Store.YES, Field.Index.NOT_ANALYZED));

			doc.add(new Field("ProductName", names[i], Field.Store.YES,Field.Index.ANALYZED));

			doc.add(new Field("ExpiryDate", expiryDates[i],Field.Store.NO, Field.Index.ANALYZED));

			doc.add(new Field("Description", descriptions[i], Field.Store.YES, Field.Index.ANALYZED));

			writer.addDocument(doc);
		}

		writer.close();
	}
	/**
	 * Method used to create IndexWriter
	 * @return
	 * @throws CorruptIndexException
	 * @throws LockObtainFailedException
	 * @throws IOException
	 */
	private IndexWriter getWriter() throws CorruptIndexException, LockObtainFailedException, IOException{
		// Directory : Where the index is stored
		// Analyzer  : Used when we indexing it will token fields
		//			   WhitespaceAnalyzer token by white space
		// Constant MaxFieldLength.UNLIMITED:
		//			 :This constant require index all tokens in the document.

		return new IndexWriter(directory, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
	}

	/**
	 * Create New Searcher [IndexSearcher] and execute a basic query [single-term query].
	 * @param fieldName : field name used to search [example : ProductID or ExpiryDate ...]
	 * @param searchString : String compare
	 * @return : Numbers of document matched.
	 * @throws CorruptIndexException
	 * @throws IOException
	 */
	protected int getHitCount(String fieldName, String searchString) throws CorruptIndexException, IOException{
		IndexSearcher searcher = new IndexSearcher(directory);
		// Build simple single-term query
		Term t = new Term(fieldName, searchString);
		Query query = new TermQuery(t);
		// get number items search match condition
		int hitCount = TestUtil.hitCount(searcher, query);
		searcher.close();
		return hitCount;
	}

	// @Test purpose to verify writer document count.
	// count number document contain in IndexWriter
	@Test
	public void testIndexWriter() throws IOException{
		IndexWriter writer = getWriter();
		assertEquals(idProducts.length, writer.numDocs());
		writer.close();
	}

	// @Test purpose to verify document count in IndexReader
	@Test
	public void testIndexReader() throws IOException{
		IndexReader reader = IndexReader.open(directory);
		assertEquals(idProducts.length, reader.maxDoc());
		assertEquals(idProducts.length, reader.numDocs());
		reader.close();
	}

	/**
	 * Difference between two methods maxDoc() and numDocs() known the total number of deleted or undeleted 
	 * Because our index contains three documents, one of which is deleted, 
	 * numDocs()returns 2 and maxDocs()returns 3.
	 * @throws IOException
	 */
	@Test
	public void testDeleteBeforeOptimize() throws IOException{
		IndexWriter writer = getWriter();
		// We test verify 3 documents in index.
		assertEquals(3,  writer.numDocs());

		// We execute delete first document.
		writer.deleteDocuments(new Term("ProductID", "id001"));
		//writer.deleteDocuments(new Term("ProductID", "id002"));
		writer.commit();

		// Test verify to known index contains deletions.
		assertTrue(writer.hasDeletions());

		// Check to known ...has two document ? // result expect : 3
		assertEquals(3, writer.maxDoc());

		// check known that one document deleted, remaining 2 docs
		assertEquals(2, writer.numDocs());

		writer.close();
	}

	/**
	 * writer.optimize()=> force Lucene to merge index segments, after deleting one document
	 * Lucene truly removes the deleted document, 2 document remains in the index
	 * @throws CorruptIndexException
	 * @throws LockObtainFailedException
	 * @throws IOException
	 */
	@Test
	public void testDeleteAfterOptimize() throws CorruptIndexException, LockObtainFailedException, IOException{
		IndexWriter writer = getWriter();

		assertEquals(3, writer.numDocs());
		writer.deleteDocuments(new Term("ProductID", "id001"));
		writer.optimize();//Optimize to compact deletions
		writer.commit();

		assertFalse(writer.hasDeletions());	// 0 document deleted	
		assertEquals(2, writer.maxDoc());
		assertEquals(2, writer.numDocs());

		writer.close();
	}

	/**
	 * Update document basic we must deletes the entire document need update and then
	 * add a new document to index. update at here like as we replace document.
	 * @throws CorruptIndexException
	 * @throws IOException
	 */
	@Test
	public void testUpdate() throws CorruptIndexException, IOException{
		// Check Product exits or not to we update it
		assertEquals(1, getHitCount("ProductID", "id001"));

		IndexWriter writer = getWriter();

		// Create new document with field values as : ID004, Lumina4, 12/09/2013, 3D Plastic
		Document doc = new Document();
		doc.add(new Field("ProductID","id004",Field.Store.YES, Field.Index.NOT_ANALYZED));

		doc.add(new Field("ProductName", "Lumina4", Field.Store.YES,Field.Index.ANALYZED));

		doc.add(new Field("ExpiryDate", "12/09/2013",Field.Store.NO, Field.Index.ANALYZED));

		doc.add(new Field("Description", "3D Plastic", Field.Store.YES, Field.Index.ANALYZED));

		// execute update Product have id001 by new document
		// Replace with new version
		writer.updateDocument(new Term("ProductID","id001"), doc);

		writer.close();

		// check ProductName=Iphone4 exits or not
		assertEquals(0, getHitCount("ProductName", "Iphone4"));
		// check update product success or not
		assertEquals(1, getHitCount("ProductName", "Lumina4"));
	}

}

Reference: Lucene in Action, 2nd edition (ebook)

Leave a comment