Let’s write some code

Here’s the significant part of some example code to implement this index plan.

/** Index the records of a CSV data file into a Xapian database.
 *
 *  @param datapath  Path of the CSV file to read.  Its first line must be a
 *		     header in which columns 0, 2 and 8 are "id_NUMBER",
 *		     "TITLE" and "DESCRIPTION" respectively.
 *  @param dbpath    Path of the database to write to; it is created if it
 *		     doesn't already exist.
 *
 *  The process exits with status 1 if the data file can't be opened or its
 *  header doesn't match the expected layout.  std::out_of_range is thrown
 *  (by vector::at) if a line has fewer fields than the offsets used here.
 */
void index(const string & datapath, const string & dbpath)
{
    // Hardcode field offsets for simplicity.
    const size_t FIELD_ID_NUMBER = 0;
    const size_t FIELD_TITLE = 2;
    const size_t FIELD_DESCRIPTION = 8;

    // Open and validate the input before touching the database, so that a
    // bad data file doesn't leave a newly created, empty database behind.
    ifstream csv(datapath.c_str());
    if (!csv) {
	cerr << "Couldn't open '" << datapath << "' for reading" << endl;
	exit(1);
    }

    vector<string> fields;
    csv_parse_line(csv, fields);

    // Check the CSV header line matches our hard-coded offsets.
    if (fields.at(FIELD_ID_NUMBER) != "id_NUMBER" ||
	fields.at(FIELD_TITLE) != "TITLE" ||
	fields.at(FIELD_DESCRIPTION) != "DESCRIPTION") {
	// The CSV format doesn't match what we expect.
	cerr << "CSV format has changed!" << endl;
	exit(1);
    }

    // Create or open the database we're going to be writing to.
    Xapian::WritableDatabase db(dbpath, Xapian::DB_CREATE_OR_OPEN);

    // Set up a TermGenerator that we'll use in indexing.
    Xapian::TermGenerator termgenerator;
    termgenerator.set_stemmer(Xapian::Stem("en"));

    while (csv_parse_line(csv, fields)) {
	// 'fields' is a vector mapping from field number to value.
	// We look up fields with the 'at' method so we get an exception
	// if that field isn't set.
	//
	// We're just going to use DESCRIPTION, TITLE and id_NUMBER.
	const string & description = fields.at(FIELD_DESCRIPTION);
	const string & title = fields.at(FIELD_TITLE);
	const string & identifier = fields.at(FIELD_ID_NUMBER);

	// We make a document and tell the term generator to use this.
	Xapian::Document doc;
	termgenerator.set_document(doc);

	// Index each field with a suitable prefix.
	termgenerator.index_text(title, 1, "S");
	termgenerator.index_text(description, 1, "XD");

	// Index fields without prefixes for general search.  The gap in
	// term positions between the two fields is added by
	// increase_termpos().
	termgenerator.index_text(title);
	termgenerator.increase_termpos();
	termgenerator.index_text(description);

	// Store all the fields for display purposes.
	doc.set_data(identifier + "\n" + title + "\n" + description);

	// We use the identifier to ensure each object ends up in the
	// database only once no matter how many times we run the
	// indexer.
	string idterm = "Q" + identifier;
	doc.add_boolean_term(idterm);
	db.replace_document(idterm, doc);
    }

    // Commit explicitly.  The WritableDatabase destructor would also commit,
    // but it swallows any exception, so a failed commit would go unnoticed.
    db.commit();
}

A full copy of this code is available in code/c++/index1.cc.

You can compile this code and then run it to index a sample data file (held in data/100-objects-v1.csv) into a database at path db as follows:

$ g++ `xapian-config --cxxflags` code/c++/index1.cc code/c++/support.cc -o index1 `xapian-config --libs`
$ ./index1 data/100-objects-v1.csv db