当前位置：首页 > news >正文

NLTK关键字抽取与轻量级搜索引擎（Whoosh、Elasticsearch）

news 来源：原创 2024/9/29 11:38:03

背景

有时候你想用一句完整的话或一段文本，在基于关键字的搜索引擎里进行搜索。但如果把整段文本直接放进去搜索，效果往往不好，因为这类搜索引擎是基于关键字匹配的，而不是语义搜索（semantic search）。那么，怎样从文本中抽取关键字呢？

利用NLTK抽取关键字的代码

import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize

# Download the NLTK resources used below.
nltk.download('punkt')
# NOTE: NLTK 3.9+ loads the tokenizer model from 'punkt_tab' instead of
# 'punkt'; fetching both keeps word_tokenize working on old and new releases.
nltk.download('punkt_tab')
nltk.download('stopwords')


def extract_keywords(text, max_keywords=10):
    """Return the most frequent non-stopword tokens of *text*.

    The text is tokenized with ``word_tokenize``; tokens that are not purely
    alphanumeric (punctuation, contractions) or that are English stopwords
    are dropped, and the remaining tokens are lowercased and ranked by
    frequency.

    Args:
        text: English text to extract keywords from.
        max_keywords: Maximum number of keywords to return.

    Returns:
        A list of lowercase keywords, most frequent first.
    """
    stop_words = set(stopwords.words('english'))
    filtered_words = [
        word.lower()
        for word in word_tokenize(text)
        if word.isalnum() and word.lower() not in stop_words
    ]
    print('filtered words:', filtered_words)

    # Rank the surviving tokens by how often they occur.
    freq_dist = FreqDist(filtered_words)
    return [word for word, _ in freq_dist.most_common(max_keywords)]


if __name__ == '__main__':
    text = """Elasticsearch provides powerful search capabilities and is commonly used in production environments for large-scale document search and retrieval. However, it might be overkill for small projects or scenarios where simpler solutions like Whoosh are sufficient. Choose the solution that best fits your needs."""
    keywords = extract_keywords(text)
    print(keywords)

执行结果

filtered words: ['elasticsearch', 'provides', 'powerful', 'search', 'capabilities', 'commonly', 'used', 'production', 'environments', 'document', 'search', 'retrieval', 'however', 'might', 'overkill', 'small', 'projects', 'scenarios', 'simpler', 'solutions', 'like', 'whoosh', 'sufficient', 'choose', 'solution', 'best', 'fits', 'needs']
['search', 'elasticsearch', 'provides', 'powerful', 'capabilities', 'commonly', 'used', 'production', 'environments', 'document']

基于关键字的搜索 - Whoosh

import os

from whoosh.fields import Schema, TEXT
from whoosh.index import create_in, open_dir
from whoosh.qparser import QueryParser

from keywords_extractor import extract_keywords

# Define the schema for the index: one stored, full-text searchable field.
schema = Schema(question=TEXT(stored=True))

# Create or open the index.
INDEX_DIR = "indexdir"
# create_in() expects the target directory to exist already.
os.makedirs(INDEX_DIR, exist_ok=True)
# Use create_in for creating a new index or open_dir for opening an existing one.
ix = create_in(INDEX_DIR, schema)

# Index your documents (replace questions with the actual content of your documents).
questions = [
    "How to implement autocomplete, I don't know?",
    "How does Angular work?",
    "how Python programming language",
    "Example question",
    "Another question",
]
writer = ix.writer()
for question in questions:
    writer.add_document(question=question)
writer.commit()

# Search using keywords. doc_content is the free-text query; its extracted
# keywords are OR-ed together so a document matching any of them is a hit.
doc_content = "what is angular"
search_keywords = extract_keywords(doc_content)
query_str = " OR ".join(search_keywords)
print(query_str)

with ix.searcher() as searcher:
    query_parser = QueryParser("question", ix.schema)
    query = query_parser.parse(query_str)
    results = searcher.search(query)
    for result in results:
        print(result)

执行结果

filtered words: ['angular']
angular
<Hit {'question': 'How does Angular work?'}>

基于关键字的搜索 - Elasticsearch

from elasticsearch import Elasticsearch

from keywords_extractor import extract_keywords

# Connect to the Elasticsearch server (make sure it's running).
# A full URL is used because 8.x clients reject host dicts without a scheme;
# the URL form is accepted by 7.x clients as well.
es = Elasticsearch("http://localhost:9200")

# Create an index. ignore=400 tolerates an "index already exists" response
# in case the index appears between the exists() check and the create() call.
index_name = "your_index_name"
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, ignore=400)

# Index a document (replace doc_content with the actual content of your documents).
doc_content = "This is the content of your document."
document = {"content": doc_content}
es.index(index=index_name, body=document)
# Newly indexed documents only become searchable after a refresh; force one
# so the search below can see the document that was just added.
es.indices.refresh(index=index_name)

# Search using keywords: match documents whose content contains any of them.
search_keywords = extract_keywords(doc_content)
query_body = {"query": {"terms": {"content": search_keywords}}}
results = es.search(index=index_name, body=query_body)
for hit in results['hits']['hits']:
    print(hit['_source'])