2022-08-08

Meilisearch Doc

Meili 是挪威神話中的神，名字意指「可愛的人」，是托爾的兄弟。

Preview

搜尋速度快、全文檢索、支援中文、容易安裝和維護

Here We Go !

安裝Server的方式

官方Server Install Doc有提供多種方式

我這邊使用的是Docker的方式

# Pull the Docker image
docker pull getmeili/meilisearch:v0.28

# Create the container. -e passes MEILI_MASTER_KEY to the server as an
# environment variable; the tag matches the image pulled above.
docker run -p 7700:7700 -d -e MEILI_MASTER_KEY='MASTER_KEY' getmeili/meilisearch:v0.28

這邊需要提到的是 MEILI_MASTER_KEY 是用來設定Server的金鑰，
如果於建立Container時添加這個屬性，
未來對這個Server操作的行為都需要帶上這個金鑰。

若直接對 Meilisearch Server 發送 API Request，則需要在 Header 帶上 Authorization: Bearer MASTER_KEY（v0.25 之前的版本使用的是 X-Meili-API-Key）

匯入搜尋資料

這邊提供了兩種方式

一個是藉由JSON, NDJSON, 或 CSV 格式的檔案匯入
另一種則是藉由SDK或API進行建立資料

實際操作如下

這邊使用Golang作為示範

官方Add Documents Doc有提供其他語言的範例

go get -u github.com/meilisearch/meilisearch-go

批量檔案匯入

package main

import (
  "os"
  "encoding/json"
  "io/ioutil"

  "github.com/meilisearch/meilisearch-go"
)

// main reads movies.json, decodes it into a list of documents and adds them
// to the "movies" index of the Meilisearch server at 127.0.0.1:7700.
//
// Every step that can fail is checked, so a missing or malformed file stops
// the program instead of silently sending an empty document list.
//
// NOTE: if the server was started with MEILI_MASTER_KEY, the key presumably
// also has to be supplied through ClientConfig.APIKey; confirm against your
// server setup.
func main() {
	client := meilisearch.NewClient(meilisearch.ClientConfig{
		Host: "http://127.0.0.1:7700",
	})

	jsonFile, err := os.Open("movies.json")
	if err != nil {
		panic(err)
	}
	// Register Close only after Open is known to have succeeded.
	defer jsonFile.Close()

	byteValue, err := ioutil.ReadAll(jsonFile)
	if err != nil {
		panic(err)
	}

	var movies []map[string]interface{}
	if err := json.Unmarshal(byteValue, &movies); err != nil {
		panic(err)
	}

	if _, err := client.Index("movies").AddDocuments(movies); err != nil {
		panic(err)
	}
}

藉由SDK寫入

package main

import (
	"fmt"
	"os"

	"github.com/meilisearch/meilisearch-go"
)

// main adds six sample game documents to the "game" index and prints the
// TaskUID of the task returned by AddDocuments. On error it prints the error
// and exits with status 1.
func main() {
	cfg := meilisearch.ClientConfig{Host: "http://127.0.0.1:7700"}
	games := meilisearch.NewClient(cfg).Index("game")

	docs := []map[string]interface{}{
		{"id": 1, "title": "星海爭霸", "genres": []string{"即時戰略", "科幻", "戰爭"}},
		{"id": 2, "title": "仙境傳說RO", "genres": []string{"角色扮演", "MMORPG", "線上遊戲"}},
		{"id": 3, "title": "暗黑破壞神", "genres": []string{"刷裝", "角色扮演", "動作"}},
		{"id": 4, "title": "英雄聯盟", "genres": []string{"MOBA", "DOTA"}},
		{"id": 5, "title": "魔獸世界", "genres": []string{"角色扮演", "MMORPG", "科幻", "線上遊戲"}},
		{"id": 6, "title": "絕對武力", "genres": []string{"第一人稱射擊遊戲", "FPS"}},
	}

	info, err := games.AddDocuments(docs)
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	fmt.Println(info.TaskUID)
}

除了用來識別的 id 之外，其餘欄位並非固定，每筆資料可以自由定義自己的欄位

package main

import (
	"fmt"
	"os"

	"github.com/meilisearch/meilisearch-go"
)

// main shows that documents in one index do not need to share the same
// fields: apart from "id", each of the three documents added to "freeDoc2"
// carries different keys. It prints the TaskUID returned by AddDocuments,
// or prints the error and exits with status 1.
func main() {
	client := meilisearch.NewClient(meilisearch.ClientConfig{Host: "http://127.0.0.1:7700"})

	var docs []map[string]interface{}
	docs = append(docs,
		map[string]interface{}{"id": 1, "忘了填": "沒關係"},
		map[string]interface{}{"id": 2, "title": "棉豆腐", "好評推薦": []string{"你", "我", "還有隻貪睡的貓"}},
		map[string]interface{}{"id": 3, "馬斯克": "我到底該不該收購twitter還是在操作一波讓他降價呢?"},
	)

	info, err := client.Index("freeDoc2").AddDocuments(docs)
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}

	fmt.Println(info.TaskUID)
}

編輯資料

// New field values for the document whose id is 6 in the "game" index.
documents := []map[string]interface{}{
	{
		"id":     6,
		"title":  "絕對武力Counter-Strike",
		"genres": []string{"第一人稱射擊遊戲", "FPS", "comedy⚡️⚡️"},
	},
}

// UpdateDocuments returns an error value; check it instead of dropping it so
// a rejected request does not go unnoticed.
if _, err := client.Index("game").UpdateDocuments(documents, ""); err != nil {
	panic(err)
}

搜尋

這邊先提一下目前架設的環境

流程圖

所以之後的操作會以SDK對Meilisearch Server行為為主

搜尋的重點
1.對哪個集合操作 (Index)
2.搜尋的項目方式 (search parameters)

// SearchRequest holds the search parameters that accompany a query.
type SearchRequest struct {
	Offset                int64
	Limit                 int64
	AttributesToRetrieve  []string // which of the document's attributes to select (e.g. a movie's overview, poster, ...)
	AttributesToCrop      []string // cropping concerns the length of text kept before and after the matched keyword
	CropLength            int64
	CropMarker            string
	AttributesToHighlight []string // attributes whose search results get highlighted
	HighlightPreTag       string // HTML placed at the start of a highlighted result
	HighlightPostTag      string // HTML placed at the end of a highlighted result; use it together with HighlightPreTag and make sure the two tags pair up
	Filter                interface{}
	ShowMatchesPosition   bool
	Facets                []string // comparable to count & group by
	PlaceholderSearch     bool
	Sort                  []string
}

// SearchResponse is the response body for search method.
// Each field is decoded from the JSON key named in its struct tag.
type SearchResponse struct {
	Hits               []interface{} `json:"hits"` // the results matched by the search
	EstimatedTotalHits int64         `json:"estimatedTotalHits"`
	Offset             int64         `json:"offset"`
	Limit              int64         `json:"limit"`
	ProcessingTimeMs   int64         `json:"processingTimeMs"`
	Query              string        `json:"query"`
	FacetDistribution  interface{}   `json:"facetDistribution,omitempty"` // omitted from the JSON when empty
}

Filter的操作就比較多樣化了

resp, err := client.Index("movies").Search("thriller", &meilisearch.SearchRequest{
  Filter: [][]string{
    []string{"genres = Horror", "genres = Mystery"},
    []string{"director = \"Jordan Peele\""},
  },
})

Search Sample

Basic Search Request

func main() {
    searchRes, err := client.Index("movies").Search("wonder",
        &meilisearch.SearchRequest{
            AttributesToHighlight: []string{"*"},
        })
    if err != nil {
        fmt.Println(err)
        os.Exit(1)
    }

    fmt.Println(searchRes.Hits)
}

Response

{
    "hits": [
        {
            "id": 2,
            "title": "Wonder Woman",
            "genres": ["Action", "Adventure"],
            "_formatted": {
                "id": 2,
                "title": "<em>Wonder</em> Woman"
            }
        }
    ],
    "offset": 0,
    "limit": 20,
    "processingTimeMs": 0,
    "query": "wonder"
}

Filter Search Request

searchRes, err := index.Search("wonder",
    &meilisearch.SearchRequest{
        Filter: "id > 1 AND genres = Action",
    })

Response

{
  "hits": [
    {
      "id": 2,
      "title": "Wonder Woman",
      "genres": ["Action","Adventure"]
    }
  ],
  "offset": 0,
  "limit": 20,
  "estimatedTotalHits": 1,
  "processingTimeMs": 0,
  "query": "wonder"
}

自己測試的結果
進行中文檢索時，有時會無法以 Array 內字串的中間幾個字搜尋到資料，
且依搜尋字數的不同會有些例外狀況，
推測是中文斷詞（tokenization）造成的限制

Sample

前二字
top2

前三字
top3

中間二字
mid2

尾二字
end2

設置

停用詞

client.Index("movie").UpdateSettings(&meilisearch.Settings{
    StopWords:[]string{"the","and","的"}
})

排序規則

client.Index("movie"). UpdateRankingRules([]string{
  "words",
  "typo",
  "proximity",
  "attribute",
  "sort",
  "exactness",
  "release_date:desc"
})

這些設置可以有效提高搜尋效果。例如在設定停用詞之前，搜尋「開源的書籍」無法命中「開源書籍」；加入停用詞「的」之後即可命中，因為比對時會忽略輸入內容中所包含的停用詞（無用詞）。

另外，功能上沒有建議/關聯字（suggest），可以通過新建 index+searchableAttributes達到。

同義字設定

官方文檔

Example

// Client for the Meilisearch server listening on 127.0.0.1:7700.
client := meilisearch.NewClient(meilisearch.ClientConfig{
	Host: "http://127.0.0.1:7700",
})

// Index used for the synonym experiment.
index := client.Index("testSynonyms")

// Four sample documents with a long "Story" text. Some stories contain words
// that are listed as synonyms further down (e.g. 大喜過望, 欣喜, 怨聲載道,
// 不服氣).
documents := []map[string]interface{}{
	{"id": 1, "title": "成語故事1", "Story": "英布漢初人，微賤的時候曾受黥（ㄑ｜ㄥˊ）刑，故又稱「黥布」。秦朝末年，英布原本跟隨項羽打天下，攻破咸陽，立下不少功勞，因此被項羽封為九江王。後來劉邦欲吸納他投效自己，安排見面時，卻故意在剛起床時召見他。英布來了之後，看到劉邦還坐在床上盥洗，感覺自己不受重視，立刻勃然大怒，後悔自己前來投效。但是等回到劉邦所賜的住所後，看到無論傢俱、食物以及侍從，規模都跟劉邦本人的一樣，英布因此而「大喜過望」。因為之前被召見時受到屈辱，現在又看到這麼多賞賜，與原本預期的不同，自然特別欣喜。後來「大喜過望」一語，就被用來形容因所得到的結果，超過原本預期而感到特別高興。"},
	{"id": 2, "title": "自我省思管理文章1", "Story": "日本女作家兼商界聞人曾野綾子在八十多歲時，出書(熟年的才情/天下雜誌出版）論述「如何怡然自得、樂觀奮進的度過晚年？她和夫婿三浦朱門同為日本優雅老化的典範。她先不論社福或立法，直接向自己和年長者提出六項挑戰，從「要求自己」做起"},
	{"id": 3, "title": "成語故事2", "Story": "到處充滿了怨恨的聲音。形容群眾普遍怨恨、不滿。《紅樓夢》第五六回：「那時裡外怨聲載道，豈不失了你們這樣人家的大禮。」《文明小史》第三一回：「伯集把帳一一的七折八扣算了，不管那些人叫苦連天，怨聲載道，就同了顧舉人出京。」也作「怨聲滿道」。"},
	{"id": 4, "title": "散文1", "Story": "我以前在家時晚上總是和哥哥姑姑一起玩兒撲克牌，在這件事上我就從來不認輸。又有一次，我和他們一起玩撲克牌，玩一個非常簡單的遊戲，但是由於我最後一步的粗心，輸掉了，但是我不服氣，我要求“再戰”，於是，我們就又玩了起來，但是，我這局又輸了，我又一次不認輸，反覆的重玩，輸了好幾局，我非常的生氣，把牌扔下就走了，然後姑姑對我說，不能做什麼事情都不服氣，輸了就是輸了，這是事實，要心平氣和，不能生氣。"},
}
// A failure to add the documents is only printed; execution continues.
if _, err := index.AddDocuments(documents); err != nil {
	fmt.Println(err)
}

// Synonym table for the "testSynonyms" index: each key maps to the list of
// words registered as its synonyms.
synonyms := map[string][]string{
	"快樂": {"大喜過望", "快活", "狂喜", "欣喜"},
	"不滿": {"不平", "不忿", "不服氣", "不順心", "怨聲載道"},
}

// A failed update is only printed, not treated as fatal.
if _, err := client.Index("testSynonyms").UpdateSynonyms(&synonyms); err != nil {
	fmt.Println(err)
}

成果

TestUnHappySynonyms

斷詞分字

官方文檔

主要講述針對中文他是使用了(Jieba-RS)這個package

但針對其斷詞分字表根據這份文件提到

We will want in the future to allow user configuration for the tokenizer. This is taken into account in the design of the new Tokenizer.

目前meiliSearch並沒有對斷詞分字表開放對應的功能(API接口)，
如果有對應的需求，依照目前的版本只能仰賴外部處理。

部署

官方文檔提供了多種部署文件

請參照

GCP

結語

ES（Elasticsearch）做為老牌搜尋引擎，功能大致都能滿足需求，但架構複雜、重量級，適合大數據量的場景，且上手較慢。
Meili 設計目標針對數據在 500GB 左右的搜尋需求，極快，單文件，超輕量。

Saxon Blogs