想說都建立 Elastic 了,不試試看這 DB 的搜尋效果就不夠意思了吧,所以就有了這篇 Elastic 搜尋篇。

上次我們建立好了一個基礎的 Elastic 環境,這部份就來試試如何添加資料進 Elastic 並搜尋出來。
!!注意!! 以下範例的 Elastic 已關閉安全驗證(因此直接以 http 連線、不帶帳號密碼),並且皆是使用 Python 去作操作。

寫入 Data

from elasticsearch import Elasticsearch

# 1. Connect to Elasticsearch (the server must be running at this address).
es = Elasticsearch("http://172.20.10.8:9200/")

# 2. Create the index.
index_name = "defect_data"


def _text_with_keyword():
    """Return a full-text field definition that also has a `keyword` sub-field."""
    return {"type": "text", "fields": {"keyword": {"type": "keyword"}}}


mapping = {
    "mappings": {
        "properties": {
            "project": _text_with_keyword(),
            "title": _text_with_keyword(),
            "description": {"type": "text"},
            # Combined-key field, stored as an exact-match keyword.
            "project_title": {"type": "keyword"},
        }
    }
}

# Only create the index when it is not there yet.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=mapping)
    print(f"Index '{index_name}' created.")

# 3. Insert documents.
# `data` is iterated as title -> description pairs; `data` and `project`
# are not defined in this snippet and must exist before it runs.
for title, description in data.items():
    combined_key = f"{project}|{title}"  # value of the combined-key field
    document = {
        "project": f"{project}",
        "title": f"{title}",
        "description": f"{description}",
        "project_title": combined_key,
    }

    # Look for an existing document with the same title under the same project.
    # NOTE(review): this is a search-then-insert check; confirm whether a
    # document indexed moments earlier is already visible to the search.
    query = {"query": {"term": {"project_title": combined_key}}}
    search_response = es.search(index=index_name, body=query)

    if search_response["hits"]["total"]["value"] != 0:
        print("Document already exists. Skipping insert.")
        continue

    response = es.index(index=index_name, body=document)
    print("Document inserted:", response)

查詢

from elasticsearch import Elasticsearch

# 1. Connect to Elasticsearch (the server must be running at this address).
es = Elasticsearch("http://0.0.0.0:9200/")
index_name = "defect_data"

# Full-text search for "RM" across both the title and the description.
query = {
    "query": {
        "multi_match": {"query": "RM", "fields": ["title", "description"]},
    }
}

search_response = es.search(index=index_name, body=query)
matched_docs = search_response['hits']['hits']

# Print how many hits came back, then each raw hit followed by a separator.
print(len(matched_docs))
for doc in matched_docs:
    print(doc)
    print("===================")
    print("===================")

此方法是在 Elastic 外部(以 Python 執行 Embedding 模型)產生向量後再寫入,所以不用為 Elastic 本身配置較多的 RAM。

寫入 Data

from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

# Initialize the Elasticsearch client
es = Elasticsearch("http://0.0.0.0:9200")

# Load the sentence-embedding model used to turn text into vectors.
model = SentenceTransformer("Alibaba-NLP/gte-Qwen2-1.5B-instruct", trust_remote_code=True)
# Set the maximum sequence length the model will accept.
# NOTE(review): the original note here spoke of *reducing* the maximum length,
# yet 131072 is a very large value - confirm it is within what this model supports.
model.max_seq_length = 131072

def generate_embedding(text):
    """Encode *text* with the module-level ``model`` and return it as a list.

    The text is passed to ``model.encode`` with ``convert_to_numpy=True`` and
    the result is converted with ``tolist()`` before being returned.
    """
    return model.encode(text, convert_to_numpy=True).tolist()

# Index definition, including the vector field used for kNN search.
index_name = "defect_embed_data"


def _searchable_text_field():
    """Return a full-text field that keeps a `keyword` sub-field for exact queries."""
    return {"type": "text", "fields": {"keyword": {"type": "keyword"}}}


index_mapping = {
    "mappings": {
        "properties": {
            "project": _searchable_text_field(),
            "title": _searchable_text_field(),
            # description is kept as plain full-text.
            "description": {"type": "text"},
            # Combined-key field for exact-match queries.
            "project_title": {"type": "keyword"},
            # Embedding vector; dims must match the embedding model's output size.
            "embedding": {
                "type": "dense_vector",
                "dims": 1536,
                "similarity": "cosine",
            },
        }
    }
}

# Create the index if it does not exist yet.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=index_mapping)

# Sample records to index.
data = [
    {
        "project": "Login System",
        "title": "Fix login timeout issue",
        "description": "Users are experiencing timeouts when logging in.",
        "project_title": "Login System - Fix login timeout issue",
    },
    {
        "project": "Database",
        "title": "Database connection failure",
        "description": "Database connection intermittently fails.",
        "project_title": "Database - Connection failure",
    },
    {
        "project": "UI",
        "title": "UI layout breaks on small screens",
        "description": "Responsive design issue causing layout to break.",
        "project_title": "UI - Layout issue on small screens",
    },
]

# Import the records into Elasticsearch.
for record in data:
    # The vector is generated from the title only and stored on the record itself.
    record["embedding"] = generate_embedding(record["title"])
    es.index(index=index_name, body=record)
print("資料匯入完成")

向量查詢

# Title to search for, and its vector form.
query_title = "Timeout during login"
query_vector = generate_embedding(query_title)

# kNN clause: search the `embedding` field with the query vector.
knn_clause = {
    "field": "embedding",
    "query_vector": query_vector,
    "k": 10,
    "num_candidates": 100,
}

knn_query = {
    "size": 5,  # return the 5 most similar results
    "knn": knn_clause,
    "_source": ["title", "description"],  # fields to return
}

# Run the query.
response = es.search(index=index_name, body=knn_query)

# Show each hit's title together with its score.
nearest_hits = response["hits"]["hits"]
for hit in nearest_hits:
    print(f"Title: {hit['_source']['title']}, Score: {hit['_score']}")

Hybrid Search

# Build the hybrid (keyword + vector) query.
query_text = "Troubleshooting login issues"
query_vector = generate_embedding(query_text)  # vector form of the same text

hybrid_query = {
    "size": 5,
    "_source": ["title", "description"],
    "query": {
        "bool": {
            "should": [
                {
                    "match": {
                        "title": {
                            # Search for query_text, i.e. the same text the
                            # query vector was generated from.
                            "query": query_text,
                            "boost": 1.5  # optional: weight title matches higher
                        }
                    }
                },
                {
                    "match": {
                        "description": {
                            "query": query_text,
                            "boost": 1.0
                        }
                    }
                },
                {
                    "script_score": {
                        "query": {
                            "exists": {
                                "field": "embedding"  # only documents that have an embedding
                            }
                        },
                        "script": {
                            # +1.0 is added to the cosine similarity before boosting.
                            "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                            "params": {
                                "query_vector": query_vector
                            }
                        },
                        "boost": 0.5  # weight of the vector-similarity clause
                    }
                }
            ],
            "minimum_should_match": 1
        }
    }
}

補充:
如果是要對全部文件做向量搜尋(不限定要有 embedding 欄位),script_score 裡的 query 可以改成以下

# Query that matches every document
"script_score": {
    "query": {
        "match_all": {}
    },

總結:

  • 普通搜尋:
    Elastic 會先把句子斷詞,並記下每個字詞出現在哪些文件、哪個位置(倒排索引);之後搜尋時依照你輸入的詞彙,透過這份索引找出包含這些字詞的文件,並使用 BM25 方式去對結果評分:依照字詞在該文件中出現的頻率、在所有文件中的稀有程度以及文件長度,計算出相關性分數。
  • 向量搜尋:
    是把句子轉換成向量之後,使用歐氏距離或餘弦相似度(本文的 mapping 設定為 cosine)去計算向量之間的相似度,並找出最相近的文件。
  • 混合搜尋:
    普通搜尋(match)會有分數,向量搜尋(script_score)也會有分數;bool 查詢的 should 會把各子查詢的分數乘上各自設定的 boost 後相加,計算出你的混合分數。

Reference

密集向量欄位類型 - elastic 中文