• Building a search engine over your own local data with jieba and whoosh
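    The idea is to plug jieba's ChineseAnalyzer into a Whoosh schema so that Chinese text is segmented properly before it is indexed and queried. Both libraries are available on PyPI (for example, pip install jieba whoosh).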


    Example

    from whoosh.index import create_in
    from whoosh.fields import Schema, TEXT, ID
    from jieba.analyse import ChineseAnalyzer
    from whoosh.qparser import QueryParser
    
    import os
    
    
    
    # jieba's ChineseAnalyzer segments Chinese text before Whoosh indexes it.
    analyzer = ChineseAnalyzer()
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer),
                    content=TEXT(stored=True, analyzer=analyzer),
                    id=ID(stored=True))
    # Create the index directory on first use.
    if not os.path.exists("index"):
        os.mkdir("index")
    ix = create_in("index", schema)
    
    
    documents = [
        {
            "title": "下文",
            "content": "首先安装jieba和whoosh库,",
            "id": "1"
        },
        {
            "title": "中文自然语言处理",
            "content": "中文自然语言处理涉及分词、词性标注、命名实体识别等...",
            "id": "2"
        }
    ]
    
    # Add the sample documents to the index and commit the batch.
    writer = ix.writer()
    for doc in documents:
        writer.add_document(title=doc["title"], content=doc["content"], id=doc["id"])
    writer.commit()
    
    # Query the "content" field; the query string is tokenized by the same analyzer.
    query_parser = QueryParser("content", schema=ix.schema)
    search_input = "jieba和whoosh"
    query = query_parser.parse(search_input)
    with ix.searcher() as searcher:
        results = searcher.search(query, limit=None)

        print(f"Found {len(results)} matching documents:")
        for result in results:
            print(f"{result['id']} - {result['title']}")
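
    Once the index has been written to disk it can be reopened in a later session without re-indexing. Below is a minimal sketch of that, assuming the "index" directory created above; MultifieldParser and Hit.highlights() are standard Whoosh APIs used here as an optional variation, not part of the original example.

    from whoosh.index import open_dir
    from whoosh.qparser import MultifieldParser

    # Reopen the index written by the example above.
    ix = open_dir("index")

    # Search the title and content fields at the same time.
    parser = MultifieldParser(["title", "content"], schema=ix.schema)
    query = parser.parse("jieba和whoosh")

    with ix.searcher() as searcher:
        results = searcher.search(query, limit=10)
        for hit in results:
            # highlights() returns an excerpt of the stored field with matched terms marked up.
            print(hit["id"], hit["title"], hit.highlights("content"))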
    
    

    Real-world example

    from whoosh.index import create_in, open_dir
    from whoosh.fields import Schema, TEXT, ID
    from jieba.analyse import ChineseAnalyzer
    from whoosh.qparser import QueryParser
    import os
    
    import jieba
    import pandas as pd
    
    from glob import glob
    from multiprocessing import Process, freeze_support
    
    from tqdm import tqdm
    
    
    class GenVocTensorForDataSet:
        def __init__(self):
            pass
    
        @staticmethod
        def gen_data_tensor(data_v, out_dir, process_count):
            """
            Tokenize each txt file with jieba and split it into non-repeating segments.

            :param data_v: list of txt file paths handled by this worker
            :param out_dir: output directory for the pickled segment lists
            :param process_count: worker index, used in the output file name
            :return: None (results are written to disk)
            """
            total_l = []
            one_p_count = 0
            for one_v in tqdm(data_v):
                one_p_count += 1

                with open(one_v, "r", encoding="utf-8") as f:
                    total_str = f.read()
                    total_str = "".join(total_str.split())  # strip all whitespace
                one_data = list(jieba.cut(total_str))
                documents = []
                text = ""
                # Grow a segment token by token and cut it as soon as the accumulated
                # text no longer occurs again in the remainder of the document.
                for one in one_data:
                    text += one
                    if text not in total_str[len("".join(documents)) + len(text):]:
                        documents.append(text)
                        text = ""
                total_l.append(documents)
            pd.to_pickle({"voc": total_l},
                         out_dir + "/{}{}.pandas_pickle_data_set".format(process_count, one_p_count))
    
        def gen_voc_data_to_tensor_set(self, paths_list_dir, out_dir, works_num=8):
            """
            Split the corpus into unique-length segments using several worker processes.

            :param paths_list_dir: directory containing the txt files
            :param out_dir: output directory for the pickled segment lists
            :param works_num: number of worker processes
            :return: None
            """
            paths_list_pr = glob(pathname=paths_list_dir + "*")

            p_list = []
            # Dispatch one slice of the file list to each worker process.
            chunk = max(1, len(paths_list_pr) // works_num)
            for i in range(0, len(paths_list_pr), chunk):
                j = i + chunk

                p = Process(target=self.gen_data_tensor, args=(
                    paths_list_pr[i:j], out_dir, i))
                p.start()
                p_list.append(p)

            for p in p_list:
                p.join()
    
        @staticmethod
        def init_data_set(paths_list_dir):
            paths_list_pr = glob(pathname=paths_list_dir + "*")
            analyzer = ChineseAnalyzer()
            schema = Schema(title=TEXT(stored=True, analyzer=analyzer), content=TEXT(stored=True, analyzer=analyzer),
                            id=ID(stored=True))
            if not os.path.exists("index"):
                os.mkdir("index")
            # create_in returns an Index object, which is not a context manager.
            ix = create_in("index", schema, indexname='article_index')

            writer = ix.writer()
            total_count_id = 0
            for one_p in paths_list_pr:
                documents = pd.read_pickle(one_p)
                for doc in tqdm(documents["voc"]):
                    # Index consecutive segment pairs: the earlier segment as content,
                    # the following segment as title.
                    for doc_i, doc_j in zip(doc[1:], doc[:-1]):
                        writer.add_document(title=doc_i, content=doc_j, id=str(total_count_id))
                        total_count_id += 1
            writer.commit()
    
        @staticmethod
        def add_data_set(paths_list_dir):
            paths_list_pr = glob(pathname=paths_list_dir + "*")
            # open_dir also returns an Index object; use the same directory and
            # index name that init_data_set created.
            ix = open_dir("index", indexname='article_index')
            writer = ix.writer()
            # Note: ids restart from 0 here, so they can repeat ids already in the index.
            total_count_id = 0
            for one_p in paths_list_pr:
                documents = pd.read_pickle(one_p)
                for doc in tqdm(documents["voc"]):
                    for doc_i, doc_j in zip(doc[1:], doc[:-1]):
                        writer.add_document(title=doc_i, content=doc_j, id=str(total_count_id))
                        total_count_id += 1
            writer.commit()
    
    
        @staticmethod
        def search_by_jieba_world(search_text):
            ix = open_dir("index", indexname='article_index')
            with ix.searcher() as searcher:
                query_parser = QueryParser("content", schema=ix.schema)
                query = query_parser.parse(search_text)
                results = searcher.search(query, limit=None)

                print(f"Found {len(results)} matching documents:")
                # Materialize the hits while the searcher is still open;
                # stored fields cannot be read after it is closed.
                hits = [(result['id'], result['title']) for result in results]
                for doc_id, title in hits:
                    print(f"{doc_id} - {title}")
            return hits
    
    
    if __name__ == '__main__':
        freeze_support()
        txt_p = "E:/just_and_sum/data_sets/"
        gvt_fds = GenVocTensorForDataSet()
        # Generate the segmented corpus
        # gvt_fds.gen_voc_data_to_tensor_set(txt_p, "E:/just_and_sum/data_set_d", works_num=8)
        # Initialize the index
        # data_base = gvt_fds.init_data_set("E:/just_and_sum/data_set_d/")
        # Search
        search_res = gvt_fds.search_by_jieba_world("头孢克洛头孢泊肟酯是同")
        print(search_res)
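
    After init_data_set / add_data_set have run, a quick sanity check is to reopen the index and count the stored documents before searching. A small sketch, reusing the directory and index name from the code above (doc_count() is a standard Whoosh Index method):

    from whoosh.index import open_dir

    # Open the index created by init_data_set ("index" directory, "article_index" index name).
    ix = open_dir("index", indexname="article_index")
    print("documents indexed:", ix.doc_count())

    # The search helper can then be called as in __main__:
    # GenVocTensorForDataSet.search_by_jieba_world("头孢克洛")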
    
    
    
  • Original article: https://blog.csdn.net/weixin_32759777/article/details/132561301