当前位置：首页 > news >正文

ElasticSearch Python API教程

news 来源：原创 2024/4/30 5:24:47

一、安装es python包

二、python API 基本使用

1. 创建es客户端连接

2. 创建索引

3. 插入数据

4. 删除索引

5. 查询

5.1 条件查询： match、match_all

5.2 条件查询: term、terms

5.3 条件查询：指定页码和大小

5.4 range查询

5.5 bool 查询 : 合并多个过滤条件查询结果的布尔逻辑

5.6 exists 查询: 存在或不存在某个字段的数据

5.7 wildcards : 使用标准的shell通配符查询

5.8 prefix 查询：查询以什么字符开头的

6. 删除数据

7. 修改数据

一、安装es python包

注意: 版本要与es 环境保持一致

 pip install elasticsearch==7.8.0

二、python API 基本使用

1. 创建es客户端连接

    # Use the official Python client to talk to Elasticsearch.
    from elasticsearch import Elasticsearch, helpers
    # Connect to the Elasticsearch node at the given URL.
    # NOTE(review): `request_timeout` looks like the constructor keyword of the
    # 8.x client; the 7.x client pinned in the install step appears to expect
    # `timeout` instead - confirm against the installed client version.
    es = Elasticsearch(hosts="http://192.168.21.103:9200", request_timeout=3600)

2. 创建索引

说明：指定mapping可以为对应字段构建索引，便于检索

def create_index(es, index_name):
    """Create ``index_name`` with a fixed settings/mappings body if it is missing.

    The index is created with one shard and one replica. Identifier-like
    fields (``id``, ``url``, ``doi``, ``citations``, ``year``) are mapped as
    ``keyword``; free-text fields (``summary``, ``title``, ``author.value``,
    ``periodical.name``) are mapped as ``text``.

    A status message is printed in every case; nothing is returned.

    :param es: Elasticsearch client exposing ``indices.exists`` and
        ``indices.create``.
    :param index_name: name of the index to create.
    """
    # NOTE(review): "citations" and "year" are keyword fields, so range
    # queries on them compare strings, not numbers - confirm this is intended.
    mappings = {
        "settings": {
            "index": {
                "number_of_shards": 1,
                "number_of_replicas": 1,
            },
        },
        "mappings": {
            "properties": {
                "id": {"type": "keyword"},
                "url": {"type": "keyword"},
                "summary": {"type": "text"},
                "author": {
                    "properties": {
                        "value": {"type": "text"},
                    },
                },
                "title": {"type": "text"},
                "periodical": {
                    "properties": {
                        "name": {"type": "text"},
                    },
                },
                "doi": {"type": "keyword"},
                "citations": {"type": "keyword"},
                "year": {"type": "keyword"},
            },
        },
    }

    # Pass the index name by keyword: newer client versions only accept
    # keyword arguments here, and keyword form also works on older ones.
    if es.indices.exists(index=index_name):
        print("索引已存在无需重复创建!")
        return

    # ignore=400 keeps the client from raising if the index was created
    # between the existence check and this call; the error body is returned
    # instead and reported below.
    result = es.indices.create(index=index_name, body=mappings, ignore=400)
    if result.get("acknowledged"):
        print("索引创建成功")
    else:
        print(f"索引创建失败:{result}")

3. 插入数据

单条插入：

def insert_data(es, index_name, data):
    """Index a single document into ``index_name``.

    If ``data`` carries a usable ``"id"`` value it is used as the document
    id; otherwise the id is omitted so that Elasticsearch generates one.
    The success message is printed in both cases.

    :param es: Elasticsearch client exposing ``index``.
    :param index_name: target index.
    :param data: document body (a dict); stored as-is, including any ``"id"`` key.
    """
    doc_id = data.get("id")
    # Only a missing or empty id means "auto-generate"; a value such as 0 is a
    # legitimate id and must not be dropped by a plain truthiness test.
    if doc_id is None or doc_id == "":
        es.index(index=index_name, body=data)
    else:
        es.index(index=index_name, id=doc_id, body=data)
    print("插入成功")

批量插入

from elasticsearch import helpers

def bulk_list(es, index_name, data_list, batch_size=100):
    """Bulk-index the documents in ``data_list`` in batches.

    Documents are sent with ``helpers.bulk`` every ``batch_size`` items, and
    any remainder is sent once the input is exhausted. A message with the
    batch size is printed after each successful call.

    :param es: Elasticsearch client passed through to ``helpers.bulk``.
    :param index_name: target index for every document.
    :param data_list: iterable of document dicts; a non-empty ``"id"`` value
        is used as the document id, otherwise the id is left out so that it
        is generated automatically (same rule as single-document inserts).
    :param batch_size: number of documents per bulk request (default 100).
    :raises ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    def flush(batch):
        helpers.bulk(es, batch)
        print(f"数据批量插入成功,数据大小:{len(batch)}, 索引:{index_name}")

    actions = []
    for data in data_list:
        # No "_type": mapping types are deprecated in 7.x and rejected by
        # later servers; the default type is used when it is omitted.
        action = {"_index": index_name, "_source": data}
        doc_id = data.get("id")
        if doc_id is not None and doc_id != "":
            action["_id"] = doc_id
        actions.append(action)
        if len(actions) >= batch_size:
            flush(actions)
            actions = []
    if actions:
        flush(actions)

4. 删除索引

def delete_index(es, index_name):
    """Delete ``index_name`` if it exists and print the outcome.

    :param es: Elasticsearch client exposing ``indices.exists`` and
        ``indices.delete``.
    :param index_name: name of the index to delete.
    """
    # Keyword arguments work on both older and newer clients; newer client
    # versions no longer accept the index name positionally.
    if not es.indices.exists(index=index_name):
        print("索引不存在")
        return
    es.indices.delete(index=index_name)
    print("索引删除成功")

5. 查询

5.1 条件查询： match、match_all

match_all : 匹配所有文档，默认只返回第一页（10 条）

match ：使用关键词match，默认根据_score降序排列

multi_match : 同时搜索多个字段

match_phrase : 短语查询

    # Alternative 1 - match_all: matches every document in the index.
    body = {
        "query": {
            "match_all": {
            }
        }
    }

    # Alternative 2 - match: full-text match of the given text against "title".
    body = {
        "query": {
            "match": {
                "title": "Compliance, identification, and internalization three processes of attitude change"
            }
        }
    }
    # Alternative 3 - multi_match: like match, but one query string is searched
    # in several fields at once.
    body = {
        "query": {
            "multi_match": {
                "query": "comprehensive",
                "fields": ["title", "summary"]
            }
        }
    }
    # Alternative 4 - match_phrase: phrase matching.
    body = {
        "query": {
            "match_phrase": {
                "title": "modern marketing"
            }
        }
    }

    # Each assignment above replaces the previous one, so only the last `body`
    # (match_phrase) is sent here; keep the variant you need.
    es.search(index=index_name, body=body)

5.2 条件查询: term、terms

term : 过滤–term主要用于精确匹配某些值，比如数字，日期，布尔值或 keyword 类型的字符串(未经分词的文本数据类型)

terms : 允许指定多个匹配条件

    # term: exact match of a single value on the "year" field.
    body = {
        "query": {
            "term": {
                "year": 1958
            }
        }
    }

    # terms: exact match against any of several values.
    body = {
        "query": {
            "terms": {
                "year": [1958, 2010]
            }
        }
    }
    # Only the last `body` assigned above (the terms query) is sent.
    es.search(index=index_name, body=body)

5.3 条件查询：指定页码和大小

    # 指定返回的数量大小
    body = {
        "query": {
            "match_all": {
            }
        },
        "from": 1, # 页码
        "size": 2 # 一页大小
    }
    es.search(index=index_name, body=body)

5.4 range查询

按照指定范围查询数据:

gt : 大于

gte: 大于等于

lt : 小于

lte : 小于等于

    # range: documents whose "year" is greater than 2018.
    # NOTE(review): "year" is mapped as keyword in the index-creation example,
    # so this comparison is presumably done on strings - confirm intent.
    body = {
        "query": {
            "range": {
                "year": {
                    "gt": 2018
                }
            }
        }
    }
    es.search(index=index_name, body=body)

5.5 bool 查询 : 合并多个过滤条件查询结果的布尔逻辑

must : 所有查询条件都必须匹配，相当于 and。
must_not : 所有查询条件都不能匹配，相当于 not。
should : 至少有一个查询条件匹配，相当于 or。

    # bool/must with two term clauses: both "year" and "doi" must match.
    body = {
        "query": {
            "bool": {
                    "must": [
                        {"term": {"year": 1958}},
                        {"term": {"doi": "10.5694/j.1326-5377.1958.tb67127.x"}}
                    ]
                }
            }
    }

    # bool/must combining a term clause with a range clause.
    # NOTE(review): "citations" is mapped as keyword in the index-creation
    # example, so "gt": 3000 is presumably a string comparison - confirm intent.
    body = {
        "query": {
            "bool": {
                    "must": [
                        {"term": {"year": 1958}},
                        {"range": {"citations": {"gt": 3000}}}
                    ]
                }
            }
    }

    # bool with must + must_not: year is 1958 and the "name" field is absent.
    body = {
        "query": {
            "bool": {
                    "must": {
                        "term": {"year": 1958}
                    },
                    "must_not": {
                        "exists": {
                            "field": "name"
                        }
                    }
                }
            }
    }
    # Only the last `body` assigned above is sent.
    es.search(index=index_name, body=body)

5.6 exists 查询: 存在或不存在某个字段的数据

    # 查询存在year字段的数据
    body = {
        "query": {
            "exists": {
                "field": "year"
            }
        }
    }
    # 查询不存在year字段的数据
    body = {
        "query": {
            "bool": {
                "must_not": {
                    "exists": {
                        "field": "year"
                    }
                }

            }
        }
    }

5.7 wildcards : 使用标准的shell通配符查询

    # wildcards 查询–使用标准的shell通配符查询
    body = {
        "query": {
            "wildcard": {
                "title": "*Structure*"
            }
        }
    }

    # wildcards 查询–使用标准的shell通配符查询
    body = {
        "query": {
            "regexp": {
                "year": "20.*"
            }
        }
    }

5.8 prefix 查询：查询以什么字符开头的

    # prefix 查询 – 以什么字符开头的
    body = {
        "query": {
            "prefix": {
                "id": "f1803ea131a96817d14290077"
            }
        }
    }

6. 删除数据

按照id删除

    # Delete the single document with this id from the index.
    es.delete(index=index_name, id='f1803ea131a96817d142900777cc1c73b41ee6c4')

删除符合条件的所有数据

    # Delete every document that matches the query (here: year matches 1958).
    body = {
        "query": {
            "match": {
                "year": 1958
            }
        }
    }
    es.delete_by_query(index=index_name, body=body)

7. 修改数据

lang：指定脚本语言,painless是内置的脚本语言

script: 代表脚本内容；ctx 代表es上下文，ctx._source 代表当前的文档。

    # Partial update: set a field's value; the field is added if it is missing.
    doc_body = {
        "doc": {
            "citations": 2532
        }
    }
    # Script update that adds a field named "source".
    doc_body = {
        'script': "ctx._source.source = 'kgPlat'"
    }

    # Script update that removes the "source" field.
    doc_body = {
        'script': "ctx._source.remove('source')"
    }


    # Only the last `doc_body` assigned above is sent with this request.
    # NOTE(review): `id` shadows the Python builtin of the same name.
    id = "727f736f07d9b0fd5ad95208079a09ee506e99e2"
    es.update(index=index_name, id=id, body=doc_body)


    # update_by_query: update every document matching the query; the query part
    # is written the same way as for search and delete_by_query.
    query = {
        "query": {
            "match": {
                "year": 1991
            }
        },
        # The script sets two fields from the values supplied in "params".
        "script": {
            "source": "ctx._source.citations = params.citations;ctx._source.citations2 = params.citations2",
            "lang": "painless",
            "params": {
                "citations": 0,
                "citations2": 0
            },
        }
    }
    es.update_by_query(index=index_name, body=query)