xapian 通过python创建索引数据库的复杂一些的范例的代码

276 阅读1分钟

把写代码过程中比较重要的代码段备份一下。下面这段代码是使用 Python 通过 Xapian 创建索引数据库的一个较复杂的范例,希望对大家有所帮助。

#!/usr/bin/env python

# Index each paragraph of a text file as a Xapian document.
# Include some values that will be of use later.

# Copyright (C) 2003,2008 James Aylett
# Copyright (C) 2004,2007 Olly Betts

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
# USA

import string
import sys
import time
import traceback

import xapian


def _iter_paragraphs(lines):
    """Yield each blank-line-separated paragraph of *lines* as one string.

    Every line is stripped of surrounding whitespace.  A paragraph is a run
    of consecutive non-empty lines, joined with single spaces.  The final
    paragraph is yielded even when the input does not end with a blank line,
    so trailing text is never silently dropped.
    """
    pieces = []
    for line in lines:
        line = line.strip()
        if line:
            pieces.append(line)
        elif pieces:
            # A blank line terminates the paragraph collected so far.
            yield ' '.join(pieces)
            pieces = []
    if pieces:
        # Input ended without a trailing blank line: flush the remainder.
        yield ' '.join(pieces)


# Usage: script PATH_TO_DATABASE < text_file
# Reads text from stdin and stores each paragraph as one Xapian document.
if len(sys.argv) != 2:
    sys.stderr.write("Usage: %s PATH_TO_DATABASE\n" % sys.argv[0])
    sys.exit(1)

try:
    # Open the database for update, creating a new database if necessary.
    database = xapian.WritableDatabase(sys.argv[1], xapian.DB_CREATE_OR_OPEN)

    indexer = xapian.TermGenerator()
    stemmer = xapian.Stem("english")
    indexer.set_stemmer(stemmer)

    for para in _iter_paragraphs(sys.stdin):
        doc = xapian.Document()
        doc.set_data(para)

        indexer.set_document(doc)
        # Term position before and after indexing; the difference is stored
        # below as the number of terms generated for this paragraph.
        tcount = indexer.get_termpos()
        indexer.index_text(para)
        tcount = indexer.get_termpos() - tcount

        # Include two values: the length of the indexed paragraph
        # (in characters), and the number of terms generated.
        doc.add_value(0, xapian.sortable_serialise(len(para)))
        doc.add_value(1, xapian.sortable_serialise(tcount))

        # Add the document to the database.
        database.add_document(doc)

except Exception as e:
    # Top-level boundary: report the failure with a traceback and exit
    # with a non-zero status.
    sys.stderr.write("Exception: %s\n" % str(e))
    traceback.print_exc()
    sys.exit(1)