站内搜索引擎的实现

379 阅读2分钟

在Whoosh搜索引擎和jieba分词的基础上使用Django Haystack实现网络搜索引擎

配置要求:

pip install django-haystack
pip install whoosh
pip install jieba

settings.py

# Configure the search engine connections
HAYSTACK_CONNECTIONS = {
    'default':{
        # Search engine class to use, given as a dotted path
        'ENGINE':'index.whoosh_cn_backend.WhooshEngine',
        'PATH':os.path.join(BASE_DIR,'whoosh_index'),
        'INCLUDE_SPELLING':True,
    },
}
# Number of results shown per page
HAYSTACK_SEARCH_RESULTS_PER_PAGE = 4
# Automatically update the index when the database changes
HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor'

搜索引擎对应WhooshEngine,但是该类不提供中文搜索,自定义类继承该类,利用jieba重写build_schema方法,使其支持中文搜索

#whoosh_cn_backend.py名称不可修改
from haystack.backends.whoosh_backend import *
from jieba.analyse import ChineseAnalyzer
class MyWhooshSearchBackend(WhooshSearchBackend):
    """Whoosh search backend whose plain-text fields use jieba's ChineseAnalyzer."""

    def build_schema(self, fields):
        """Build the Whoosh schema for the given index fields.

        Args:
            fields: mapping whose values are the index field objects; only
                the values are consulted.

        Returns:
            A ``(content_field_name, Schema)`` tuple, where
            ``content_field_name`` is the index name of the field flagged
            ``document=True`` (an empty string when no field is flagged).

        Raises:
            SearchBackendError: if ``fields`` adds nothing beyond the three
                built-in identifier columns.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Remember how many columns are built in, so an empty index
        # definition can be detected after the loop.
        builtin_count = len(schema_fields)
        content_field_name = ''

        for field in fields.values():
            key = field.index_fieldname

            if field.is_multivalued:
                if field.indexed is False:
                    column = IDLIST(stored=True, field_boost=field.boost)
                else:
                    column = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field.boost,
                    )
            elif field.field_type in ('date', 'datetime'):
                column = DATETIME(stored=field.stored, sortable=True)
            elif field.field_type == 'integer':
                column = NUMERIC(
                    stored=field.stored, numtype=int, field_boost=field.boost
                )
            elif field.field_type == 'float':
                column = NUMERIC(
                    stored=field.stored, numtype=float, field_boost=field.boost
                )
            elif field.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                column = BOOLEAN(stored=field.stored)
            elif field.field_type == 'ngram':
                column = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field.stored,
                    field_boost=field.boost,
                )
            elif field.field_type == 'edge_ngram':
                column = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at='start',
                    stored=field.stored,
                    field_boost=field.boost,
                )
            else:
                # Every remaining field type is indexed as text and analyzed
                # with jieba's ChineseAnalyzer.
                column = TEXT(
                    stored=True,
                    analyzer=ChineseAnalyzer(),
                    field_boost=field.boost,
                    sortable=True,
                )

            schema_fields[key] = column

            if field.document is True:
                content_field_name = key
                column.spelling = True

        # Fail more gracefully than relying on the backend to die if no
        # fields are found.
        if len(schema_fields) <= builtin_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

        return content_field_name, Schema(**schema_fields)

class WhooshEngine(BaseEngine):
    """Engine class pairing the jieba-enabled backend with the Whoosh query class."""
    # Backend whose build_schema analyzes text fields with ChineseAnalyzer.
    backend = MyWhooshSearchBackend
    # Not defined in this file; presumably supplied by the star import from
    # haystack.backends.whoosh_backend.
    query = WhooshSearchQuery

建立例子模型Product

from django.db import models

class Product(models.Model):
    """Example product model used to demonstrate the search index."""

    id = models.AutoField(verbose_name='序号', primary_key=True)
    name = models.CharField(verbose_name='名称', max_length=50)
    weight = models.CharField(verbose_name='重量', max_length=20)
    describe = models.CharField(verbose_name='描述', max_length=500)

    def __str__(self):
        """Represent the product by its name."""
        return self.name

利用搜索引擎创建索引

# search_indexes.py名称不可修改
from haystack import indexes
from .models import Product
#创建索引类
#类名必须是模型名+Index
class ProductIndex(indexes.SearchIndex, indexes.Indexable):
    """Haystack search index definition for the ``Product`` model."""

    # The document field of this index; use_template=True is set, so its
    # content is expected to come from a template file.
    text = indexes.CharField(use_template=True, document=True)

    def get_model(self):
        """Return the model class this index is built for."""
        return Product

    def index_queryset(self, using=None):
        """Return the objects to index: every row of the indexed model."""
        model = self.get_model()
        return model.objects.all()

在templates/search/indexes/index/下建product_text.txt,此文件中定义引擎的索引字段

{{ object.name }}
{{ object.describe }}

创建完后利用python manage.py rebuild_index完成索引文件的创建

完成创建后在views中定义响应方法类,该类继承SearchView类

from django.shortcuts import render
from django.core.paginator import *
from django.conf import settings
from .models import *
from haystack.views import SearchView
#通用视图实现
class MySearchView(SearchView):
    """Search view that lists every product when no query is supplied.

    With a non-empty ``q`` parameter the request is handled by the parent
    ``SearchView``; with an empty or missing ``q`` all products are shown,
    paginated by ``settings.HAYSTACK_SEARCH_RESULTS_PER_PAGE``.
    """
    # Template file used to render the response.
    template = 'search.html'

    def create_response(self):
        """Build the response for the current request.

        Returns:
            The parent class's search response when ``q`` is non-empty,
            otherwise a rendered, paginated listing of all products.
        """
        if self.request.GET.get('q', ''):
            # A query was supplied: let the parent view produce the results.
            return super(MySearchView, self).create_response()

        # NOTE: the template context below is locals(), so the local names in
        # this branch (show_all, product, per, p, num, page) are visible to
        # the template and are deliberately kept unchanged.
        show_all = True
        product = Product.objects.all().order_by('id')
        per = settings.HAYSTACK_SEARCH_RESULTS_PER_PAGE
        p = Paginator(product, per)
        try:
            num = int(self.request.GET.get('page', 1))
            page = p.page(num)
        except (PageNotAnInteger, ValueError):
            # int() raises ValueError for input such as ?page=abc before the
            # paginator is reached, so it is caught here as well; fall back
            # to the first page.
            num = 1
            page = p.page(num)
        except EmptyPage:
            # Page number out of range: show the last page instead.
            page = p.page(p.num_pages)
        return render(self.request, self.template, locals())