Indexing a Document Collection

This section explains how to set each document's URI to its filesystem path, so that the URI tree mirrors the filesystem tree. This is required in order to use PAPI enumeration to detect new and deleted documents.

This page provides the example in two languages: Java and C#.

The following code snippets demonstrate the new Document constructor.

Java Code

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;

import com.exalead.papi.helper.Document;
import com.exalead.papi.helper.Part;
import com.exalead.papi.helper.PushAPI;
import com.exalead.papi.helper.PushAPIException;

/**
 * Pushes the regular files found directly inside a folder to the indexing
 * server through the Push API.
 *
 * <p>Each document's URI is the file's absolute path and its stamp is the
 * file's last-modified time. Sub-folders are not visited.
 */
public class FolderIndexer {
    private final PushAPI papi;
    private final Logger logger;

    /**
     * @param papi   Push API client used to send the documents
     * @param logger logger that receives per-file failure reports
     */
    public FolderIndexer(final PushAPI papi, final Logger logger) {
        this.papi = papi;
        this.logger = logger;
    }

    /**
     * Indexes every regular file that is a direct child of {@code folder}.
     * Failures are logged per file and do not stop the remaining files from
     * being processed.
     *
     * @param folder the folder whose files are pushed
     */
    void index(final File folder) {
        // listFiles() returns null (not an empty array) when the path is not
        // a directory or cannot be read.
        final File[] files = folder.listFiles();
        if (files == null) {
            logger.error("Could not list folder " + folder.getAbsolutePath());
            return;
        }
        for (final File file : files) {
            if (file.isFile()) {
                indexFile(file);
            }
        }
    }

    /** Reads one file fully into memory and pushes it as a single-part document. */
    private void indexFile(final File file) {
        final InputStream stream;
        try {
            stream = new BufferedInputStream(new FileInputStream(file));
        } catch (final FileNotFoundException e) {
            logger.error("File does not exist: " + file.getAbsolutePath(), e);
            return;
        }
        try {
            final byte[] bytes = IOUtils.toByteArray(stream);
            final Document doc = new Document(file.getAbsolutePath(),
                    String.valueOf(file.lastModified()));
            doc.addPart(new Part(bytes));
            papi.addDocument(doc);
        } catch (final IOException e) {
            logger.error("Could not read file " + file.getAbsolutePath(), e);
        } catch (final PushAPIException e) {
            logger.error("Could not send file to indexing server", e);
        } finally {
            // Always release the file handle, whether the push succeeded or not.
            try {
                stream.close();
            } catch (final IOException e) {
                logger.warn("Could not close file " + file.getAbsolutePath(), e);
            }
        }
    }
}

C# Code

/// <summary>
/// Pushes every file of the current directory (".") as one document.
/// The file path is used as the document URI and the last write time as
/// its stamp; creation date and size are attached as metas, and the file
/// content is sent as a single part carrying the file extension.
/// </summary>
/// <remarks>
/// Relies on a <c>papi</c> member declared outside this snippet
/// (presumably the Push API client - confirm against the enclosing class).
/// </remarks>
public void IndexDocumentCollection()
{
    string[] paths = Directory.GetFiles(".");
    foreach (string path in paths)
    {
        Console.WriteLine("Push document : " + path);

        Document document = new Document(path);
        FileInfo info = new FileInfo(path);

        // Stamp associated with the document.
        string stamp = info.LastWriteTime.ToString();
        document.Stamp = stamp;

        // Metas: creation date and size of the file.
        Meta creationDate = new Meta("creation_date", info.CreationTime.ToString());
        Meta size = new Meta("size", info.Length.ToString());
        MetaContainer metas = new MetaContainer();
        metas.AddMeta(creationDate);
        metas.AddMeta(size);
        document.MetaContainer = metas;

        // Master part: the raw file content, tagged with the file extension.
        PartContainer parts = new PartContainer();
        byte[] content = File.ReadAllBytes(path);
        Part master = new Part(content);
        master.Extension = info.Extension;
        parts.AddPart(master);
        document.PartContainer = parts;

        // Push the document.
        papi.AddDocument(document);
    }
}