把写作过程中比较重要的内容备份一下。下面这段内容是一个通过 Python 使用 Xapian 创建索引数据库的稍复杂一些的范例,应该对小伙伴们有所帮助。
#!/usr/bin/env python
# Index each paragraph of a text file as a Xapian document.
# Include some values that will be of use later.
# Copyright (C) 2003,2008 James Aylett
# Copyright (C) 2004,2007 Olly Betts
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
# USA
import string
import sys
import time
import traceback

import xapian
def iter_paragraphs(lines):
    """Yield each paragraph found in *lines* as a single string.

    A paragraph is a run of consecutive non-blank lines.  Every line is
    stripped of surrounding whitespace, and the lines belonging to one
    paragraph are joined with single spaces.  Blank lines only separate
    paragraphs; they never produce an empty paragraph.  The final
    paragraph is yielded even when the input does not end with a blank
    line.
    """
    pieces = []
    for line in lines:
        line = line.strip()
        if line:
            pieces.append(line)
        elif pieces:
            # A blank line closes the paragraph collected so far.
            yield ' '.join(pieces)
            pieces = []
    if pieces:
        # Input ended without a trailing blank line: emit the paragraph
        # that was still being accumulated instead of dropping it.
        yield ' '.join(pieces)


def index_paragraph(database, indexer, para):
    """Add *para* to *database* as one document, indexed with *indexer*.

    The paragraph text is stored as the document data.  Two values are
    attached, both encoded with ``xapian.sortable_serialise``:

    * slot 0 -- ``len(para)``, the length of the indexed paragraph;
    * slot 1 -- how far the indexer's term position advanced while
      indexing the text (the number of terms generated).
    """
    doc = xapian.Document()
    doc.set_data(para)
    indexer.set_document(doc)

    # Measure the term position before and after indexing so that only
    # the positions produced by this paragraph are counted.
    start_pos = indexer.get_termpos()
    indexer.index_text(para)
    tcount = indexer.get_termpos() - start_pos

    doc.add_value(0, xapian.sortable_serialise(len(para)))
    doc.add_value(1, xapian.sortable_serialise(tcount))

    # Add the document to the database.
    database.add_document(doc)


def main():
    """Index paragraphs read from standard input into a Xapian database.

    Expects exactly one command-line argument: the path of the database.
    Exits with status 1 after writing a usage message to stderr when the
    argument count is wrong, and also exits with status 1 (after writing
    the error and a traceback to stderr) if any exception is raised while
    opening the database or indexing.
    """
    if len(sys.argv) != 2:
        sys.stderr.write("Usage: %s PATH_TO_DATABASE\n" % sys.argv[0])
        sys.exit(1)

    try:
        # Open the database for update, creating a new database if necessary.
        database = xapian.WritableDatabase(sys.argv[1],
                                           xapian.DB_CREATE_OR_OPEN)

        indexer = xapian.TermGenerator()
        indexer.set_stemmer(xapian.Stem("english"))

        for para in iter_paragraphs(sys.stdin):
            index_paragraph(database, indexer, para)
    except Exception as e:
        sys.stderr.write("Exception: %s\n" % str(e))
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()