Tweak token prioritization in Typesense (#5776)

* Tweak token prioritization in typesense * tweaks * allow configuring max_candidates * tweak max_candidates * final changes
2026-04-15 20:45:41 +01:00
parent 3d5f29a7a2
commit 546b117437
3 changed files with 325 additions and 14 deletions
--- a/apps/labrinth/.env.local
+++ b/apps/labrinth/.env.local
@@ -16,7 +16,7 @@ DATABASE_URL=postgresql://labrinth:labrinth@localhost/labrinth
 DATABASE_MIN_CONNECTIONS=0
 DATABASE_MAX_CONNECTIONS=16
-SEARCH_BACKEND=meilisearch
+SEARCH_BACKEND=typesense
 # Meilisearch configuration
 MEILISEARCH_READ_ADDR=http://localhost:7700
--- a/apps/labrinth/src/search/backend/typesense/mod.rs
+++ b/apps/labrinth/src/search/backend/typesense/mod.rs
@@ -83,10 +83,16 @@ pub struct RequestConfig {
    pub prioritize_exact_match: bool,
    #[serde(default = "default_prioritize_num_matching_fields")]
    pub prioritize_num_matching_fields: bool,
    #[serde(default = "default_prioritize_token_positions")]
    pub prioritize_token_positions: bool,
    #[serde(default = "default_drop_tokens_threshold")]
    pub drop_tokens_threshold: usize,
    #[serde(default)]
    pub text_match_type: TextMatchType,
    #[serde(default)]
    pub bucketing: Bucketing,
    #[serde(default = "default_max_candidates")]
    pub max_candidates: usize,
 }
 impl Default for RequestConfig {
@@ -98,32 +104,38 @@ impl Default for RequestConfig {
            prioritize_exact_match: default_prioritize_exact_match(),
            prioritize_num_matching_fields:
                default_prioritize_num_matching_fields(),
            prioritize_token_positions: default_prioritize_token_positions(),
            drop_tokens_threshold: default_drop_tokens_threshold(),
            text_match_type: TextMatchType::default(),
            bucketing: Bucketing::default(),
            max_candidates: default_max_candidates(),
        }
    }
 }
 /// Returns the default list of field names to query, as owned strings
 /// (presumably the serde default for `query_by`; the attribute itself is
 /// outside this hunk).
 ///
 /// After this change the list is `name`, `indexed_name`, `slug`, `author`
 /// and `indexed_author`. `summary` is no longer included; the previous
 /// six-entry list is kept as a comment.
 // NOTE(review): the list has the same number of entries as the vectors
 // returned by `default_query_by_weights` and `default_prefix`, so the three
 // presumably have to be edited together — confirm against the consumer.
 fn default_query_by() -> Vec<String> {
-    [
+    // [
-        "name",
+    //     "name",
-        "indexed_name",
+    //     "indexed_name",
-        "slug",
+    //     "slug",
-        "author",
+    //     "author",
-        "indexed_author",
+    //     "indexed_author",
-        "summary",
+    //     "summary",
-    ]
+    // ]
-    .into_iter()
+    ["name", "indexed_name", "slug", "author", "indexed_author"]
-    .map(str::to_string)
+        .into_iter()
-    .collect()
+        .map(str::to_string)
        .collect()
 }
 /// Returns the default weights `[15, 15, 10, 3, 3]`.
 ///
 /// The trailing weight `1` of the previous six-entry vector was dropped in
 /// this change; the old vector is kept as a comment.
 // NOTE(review): five weights match the five field names now returned by
 // `default_query_by`; presumably the two are paired by position — confirm.
 fn default_query_by_weights() -> Vec<u8> {
-    vec![15, 15, 10, 3, 3, 1]
+    // vec![15, 15, 10, 3, 3, 1]
    vec![15, 15, 10, 3, 3]
 }
 /// Returns the default prefix flags: five `true` values.
 ///
 /// The previous vector had six entries; it is kept as a comment.
 // NOTE(review): five flags match the five field names now returned by
 // `default_query_by`; presumably one flag applies per field — confirm.
 fn default_prefix() -> Vec<bool> {
-    vec![true, true, true, true, true, true]
+    // vec![true, true, true, true, true, true]
    vec![true, true, true, true, true]
 }
 const fn default_prioritize_exact_match() -> bool {
@@ -134,6 +146,20 @@ const fn default_prioritize_num_matching_fields() -> bool {
    false
 }
 /// Serde default for `RequestConfig::prioritize_token_positions`, also used
 /// by `RequestConfig::default()`.
 ///
 /// The default is `false`; the original kept `true` next to it as a
 /// commented-out alternative.
 const fn default_prioritize_token_positions() -> bool {
    const PRIORITIZE_TOKEN_POSITIONS: bool = false;
    PRIORITIZE_TOKEN_POSITIONS
 }
 /// Serde default for `RequestConfig::drop_tokens_threshold`, also used by
 /// `RequestConfig::default()`.
 ///
 /// The default is `1`; the original kept `0` next to it as a commented-out
 /// alternative.
 const fn default_drop_tokens_threshold() -> usize {
    const DROP_TOKENS_THRESHOLD: usize = 1;
    DROP_TOKENS_THRESHOLD
 }
 /// Serde default for `RequestConfig::max_candidates`, also used by
 /// `RequestConfig::default()`. The value is sent with each search request
 /// as the `max_candidates` parameter.
 const fn default_max_candidates() -> usize {
    const MAX_CANDIDATES: usize = 8;
    MAX_CANDIDATES
 }
 impl TypesenseConfig {
    pub fn new(meta_namespace: Option<String>) -> Self {
        Self {
@@ -696,6 +722,14 @@ impl SearchBackend for Typesense {
                    .prioritize_num_matching_fields
                    .to_string(),
            ),
            (
                "prioritize_token_positions",
                info.typesense_config.prioritize_token_positions.to_string(),
            ),
            (
                "drop_tokens_threshold",
                info.typesense_config.drop_tokens_threshold.to_string(),
            ),
            (
                "text_match_type",
                info.typesense_config.text_match_type.as_str().to_string(),
@@ -707,6 +741,10 @@ impl SearchBackend for Typesense {
            ("group_limit", "1".to_string()),
            ("facet_by", "project_id".to_string()),
            ("max_facet_values", "0".to_string()),
            (
                "max_candidates",
                info.typesense_config.max_candidates.to_string(),
            ),
        ];
        if let Some(query_by_weights) =
            Self::query_by_weights(&info.typesense_config)
--- a/scripts/import-projects.py
+++ b/scripts/import-projects.py
@@ -0,0 +1,273 @@
 #!/usr/bin/env python3
 """
 Search projects on api.modrinth.com and import results into the local database
 with correct author names.
 Modes:
  search  - Import top N results for a text query
  top     - Import the top N projects by total downloads (for building a
            representative corpus that mirrors prod IDF distributions)
 Usage:
    python3 scripts/import-projects.py search <query> [limit]
    python3 scripts/import-projects.py top [count]
 Examples:
    python3 scripts/import-projects.py search "sodium" 5
    python3 scripts/import-projects.py top 1000
 """
 import json
 import subprocess
 import sys
 import time
 import urllib.parse
 import urllib.request
 # User id used as the project owner when creating an author user fails.
 ADMIN_USER_ID = 103587649610509
 # Container, role and database that psql() targets via `podman exec`.
 DB_CONTAINER = "labrinth-postgres"
 DB_USER = "labrinth"
 DB_NAME = "labrinth"
 # Base URL and request headers for every api_get() call.
 API_BASE = "https://api.modrinth.com/v2"
 HEADERS = {"User-Agent": "import-projects-script/1.0"}
 # Slugs already handled in this run; import_project() skips repeats.
 seen_slugs = set()
 # Author name -> user id chosen for that author during this run.
 author_user_ids = {}
 # Next id handed out by get_or_create_author_user(); incremented per new author.
 next_user_id = 200_000_000_000_000
 def api_get(url):
    """Request ``url`` with the script's HTTP headers and return the parsed JSON body."""
    request = urllib.request.Request(url, headers=HEADERS)
    with urllib.request.urlopen(request) as response:
        payload = response.read()
    return json.loads(payload.decode())
 def psql(sql):
    """Run ``sql`` with ``psql -c`` inside the database container via ``podman exec``.

    Returns ``True`` when the command exits with status 0. Otherwise the
    stripped stderr of the command is printed to this process's stderr and
    ``False`` is returned.
    """
    command = ["podman", "exec", DB_CONTAINER, "psql"]
    command += ["-U", DB_USER, "-d", DB_NAME, "-c", sql]
    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode == 0:
        return True
    print(f"  DB error: {completed.stderr.strip()}", file=sys.stderr)
    return False
 def sql_escape(s):
    """Double every single quote in ``s`` so it can be placed inside a single-quoted SQL literal."""
    return "''".join(s.split("'"))
 def get_or_create_author_user(author_name):
    """Return the user id to use for ``author_name``, inserting a user row on first sight.

    The first call for a name takes the next id from the module-level counter
    and inserts a ``developer`` user with that id. If the insert command
    fails, the admin user id is used instead. Either way the choice is cached,
    so later calls for the same name return it without touching the database.

    NOTE(review): the insert uses ``ON CONFLICT (id) DO NOTHING`` and psql()
    only reports the exit status, so an id that already exists appears to be
    treated as success — confirm this is intended when the script is re-run.
    """
    global next_user_id
    try:
        return author_user_ids[author_name]
    except KeyError:
        pass
    candidate_id, next_user_id = next_user_id, next_user_id + 1
    escaped = sql_escape(author_name)
    statement = f"""
    INSERT INTO users (id, username, email, created, role)
    VALUES ({candidate_id}, '{escaped}', '{escaped}@imported.local', NOW(), 'developer')
    ON CONFLICT (id) DO NOTHING;
    """
    resolved = candidate_id if psql(statement) else ADMIN_USER_ID
    author_user_ids[author_name] = resolved
    return resolved
 def import_project(hit, counter):
    """Insert one search hit into the local database as an approved project.

    ``hit`` is a dict from the search response; the keys read are ``slug``,
    ``title``, ``description``, ``project_id``, ``downloads``, ``follows``,
    ``icon_url`` and ``author``. ``counter`` is multiplied by 5 and added to a
    time-derived base to form the ids of the inserted rows.

    Returns ``False`` without touching the database when the slug was already
    seen in this run; otherwise returns the result of ``psql()`` for the
    insert transaction.
    """
    slug = hit.get("slug", "")
    # Skip slugs already handled during this run.
    if slug in seen_slugs:
        return False
    seen_slugs.add(slug)
    title = hit.get("title", "")
    # The hit's "description" is stored as the summary, cut to 2048 characters.
    # NOTE(review): a hit whose "description" is present but null would raise
    # here — confirm the API never returns null for it.
    summary = hit.get("description", "")[:2048]
    project_id_api = hit.get("project_id", "")
    downloads = hit.get("downloads", 0)
    follows = hit.get("follows", 0)
    icon_url = hit.get("icon_url") or None
    author_name = hit.get("author", "Unknown")
    print(f"  Fetching: {title}")
    # Fetch the full project record for its body text (cut to 65536
    # characters) and icon; on any failure the summary is used as the
    # description and the icon from the hit is kept.
    try:
        project_data = api_get(f"{API_BASE}/project/{project_id_api}")
        description = (project_data.get("body") or "")[:65536]
        icon_url = project_data.get("icon_url") or icon_url
    except Exception:
        description = summary
    author_id = get_or_create_author_user(author_name)
    # Time-derived base in [100_000_000_000_000, 1_000_000_000_000_000); this
    # project's rows use base + counter * 5 + {0, 1, 2, 3}.
    # NOTE(review): the base is recomputed on every call, so uniqueness across
    # calls depends on the clock as well as on counter — confirm acceptable.
    base = int(time.time() * 1e9) % 900_000_000_000_000 + 100_000_000_000_000
    mod_id = base + counter * 5
    team_id = base + counter * 5 + 1
    member_id = base + counter * 5 + 2
    version_id = base + counter * 5 + 3
    title_e = sql_escape(title)
    summary_e = sql_escape(summary)
    description_e = sql_escape(description)
    slug_e = sql_escape(slug)
    # Quoted SQL literal for the icon URL, or NULL when there is none.
    icon_col = f"'{sql_escape(icon_url)}'" if icon_url else "NULL"
    print(
        f"  Importing: {title} (author={author_name}, downloads={downloads}, followers={follows})"
    )
    # One transaction: the team, the project row (mods), the owner membership,
    # a single hard-coded 1.0.0 version, and a loader link for that version.
    # NOTE(review): loader_id 2 is hard-coded — confirm which loader it is.
    sql = f"""
 BEGIN;
 INSERT INTO teams (id) VALUES ({team_id});
 INSERT INTO mods (
    id, team_id, name, summary, description,
    published, downloads, follows,
    status, license, side_types_migration_review_status,
    components, monetization_status, slug,
    icon_url, raw_icon_url
 ) VALUES (
    {mod_id},
    {team_id},
    '{title_e}',
    '{summary_e}',
    '{description_e}',
    NOW(),
    {downloads},
    {follows},
    'approved',
    'LicenseRef-All-Rights-Reserved',
    'reviewed',
    '{{}}'::jsonb,
    'monetized',
    LOWER('{slug_e}'),
    {icon_col},
    {icon_col}
 );
 INSERT INTO team_members (
    id, team_id, user_id, role, permissions,
    accepted, payouts_split, ordering, is_owner
 ) VALUES (
    {member_id},
    {team_id},
    {author_id},
    'Owner',
    1275068466,
    true,
    1.00000000000000000000,
    0,
    true
 );
 INSERT INTO versions (
    id, mod_id, name, version_number, version_type,
    author_id, downloads, changelog, status, components
 ) VALUES (
    {version_id},
    {mod_id},
    '1.0.0',
    '1.0.0',
    'release',
    {author_id},
    {downloads},
    '',
    'listed',
    '{{}}'::jsonb
 );
 INSERT INTO loaders_versions (loader_id, version_id) VALUES (2, {version_id});
 COMMIT;
 """
    return psql(sql)
 def mode_search(query, limit=5):
    """Search the API for ``query`` and import up to ``limit`` of the returned hits.

    Each hit is passed to import_project() together with its position in the
    result list. Prints a summary of how many imports succeeded, or a notice
    when the search returned no hits.
    """
    quoted = urllib.parse.quote(query)
    url = f"{API_BASE}/search?query={quoted}&limit={limit}&facets=[]"
    print(f"Searching Modrinth for: {query} (limit: {limit})")
    results = api_get(url).get("hits", [])
    if results:
        imported = sum(
            1 for position, hit in enumerate(results) if import_project(hit, position)
        )
        print(f"Done. Imported {imported} project(s).")
    else:
        print("No results found.")
 def mode_top(count=1000):
    """Import the first ``count`` projects from the API's downloads-sorted search.

    Results are requested in pages of at most 50. Paging stops early when a
    page comes back empty, and the function sleeps one second after each
    non-empty page. Every hit gets a running counter, passed to
    import_project(), that keeps increasing across pages.
    """
    print(f"Fetching top {count} projects by downloads from Modrinth...")
    page_size = 50
    imported = 0
    next_counter = 0
    for offset in range(0, count, page_size):
        limit = min(page_size, count - offset)
        url = f"{API_BASE}/search?limit={limit}&offset={offset}&index=downloads&facets=[]"
        print(f"\n  Batch offset={offset}, limit={limit}")
        hits = api_get(url).get("hits", [])
        if not hits:
            break
        for counter, hit in enumerate(hits, start=next_counter):
            if import_project(hit, counter):
                imported += 1
        next_counter += len(hits)
        time.sleep(1)
    print(f"\nDone. Imported {imported} project(s).")
 def main():
    """Command-line entry point: dispatch to the ``search`` or ``top`` mode.

    Reads ``sys.argv``. Prints usage and exits with status 1 when no mode is
    given, when ``search`` is given without a query, or when the mode is
    unknown. The optional limit/count argument is converted with ``int()``,
    so a non-numeric value raises ``ValueError``.
    """
    program = sys.argv[0]
    if len(sys.argv) < 2:
        print(f"Usage: {program} search <query> [limit]")
        print(f"       {program} top [count]")
        sys.exit(1)
    mode = sys.argv[1]
    if mode == "search":
        if len(sys.argv) < 3:
            # This must be an f-string: without the prefix the message showed
            # the literal text "{sys.argv[0]}" instead of the program name.
            print(f"Usage: {program} search <query> [limit]")
            sys.exit(1)
        query = sys.argv[2]
        limit = int(sys.argv[3]) if len(sys.argv) > 3 else 5
        mode_search(query, limit)
    elif mode == "top":
        count = int(sys.argv[2]) if len(sys.argv) > 2 else 1000
        mode_top(count)
    else:
        print(f"Unknown mode: {mode}. Use 'search' or 'top'.")
        sys.exit(1)
 # Run the CLI only when this file is executed as a script, not when imported.
 if __name__ == "__main__":
    main()