Tweak token prioritization in Typesense (#5776)

* Tweak token prioritization in typesense * tweaks * allow configuring max_candidates * tweak max_candidates * final changes
2026-04-15 20:45:41 +01:00
parent 3d5f29a7a2
commit 546b117437
3 changed files with 325 additions and 14 deletions
--- a/apps/labrinth/.env.local
+++ b/apps/labrinth/.env.local
@@ -16,7 +16,7 @@ DATABASE_URL=postgresql://labrinth:labrinth@localhost/labrinth
 DATABASE_MIN_CONNECTIONS=0
 DATABASE_MAX_CONNECTIONS=16

-SEARCH_BACKEND=meilisearch
+SEARCH_BACKEND=typesense

 # Meilisearch configuration
 MEILISEARCH_READ_ADDR=http://localhost:7700
--- a/apps/labrinth/src/search/backend/typesense/mod.rs
+++ b/apps/labrinth/src/search/backend/typesense/mod.rs
@@ -83,10 +83,16 @@ pub struct RequestConfig {
    pub prioritize_exact_match: bool,
    #[serde(default = "default_prioritize_num_matching_fields")]
    pub prioritize_num_matching_fields: bool,
+    #[serde(default = "default_prioritize_token_positions")]
+    pub prioritize_token_positions: bool,
+    #[serde(default = "default_drop_tokens_threshold")]
+    pub drop_tokens_threshold: usize,
    #[serde(default)]
    pub text_match_type: TextMatchType,
    #[serde(default)]
    pub bucketing: Bucketing,
+    #[serde(default = "default_max_candidates")]
+    pub max_candidates: usize,
 }

 impl Default for RequestConfig {
@@ -98,32 +104,38 @@ impl Default for RequestConfig {
            prioritize_exact_match: default_prioritize_exact_match(),
            prioritize_num_matching_fields:
                default_prioritize_num_matching_fields(),
+            prioritize_token_positions: default_prioritize_token_positions(),
+            drop_tokens_threshold: default_drop_tokens_threshold(),
            text_match_type: TextMatchType::default(),
            bucketing: Bucketing::default(),
+            max_candidates: default_max_candidates(),
        }
    }
 }

 fn default_query_by() -> Vec<String> {
-    [
-        "name",
-        "indexed_name",
-        "slug",
-        "author",
-        "indexed_author",
-        "summary",
-    ]
-    .into_iter()
-    .map(str::to_string)
-    .collect()
+    // [
+    //     "name",
+    //     "indexed_name",
+    //     "slug",
+    //     "author",
+    //     "indexed_author",
+    //     "summary",
+    // ]
+    ["name", "indexed_name", "slug", "author", "indexed_author"]
+        .into_iter()
+        .map(str::to_string)
+        .collect()
 }

 fn default_query_by_weights() -> Vec<u8> {
-    vec![15, 15, 10, 3, 3, 1]
+    // vec![15, 15, 10, 3, 3, 1]
+    vec![15, 15, 10, 3, 3]
 }

 fn default_prefix() -> Vec<bool> {
-    vec![true, true, true, true, true, true]
+    // vec![true, true, true, true, true, true]
+    vec![true, true, true, true, true]
 }

 const fn default_prioritize_exact_match() -> bool {
@@ -134,6 +146,20 @@ const fn default_prioritize_num_matching_fields() -> bool {
    false
 }

+/// Serde default for `RequestConfig::prioritize_token_positions`, also used by
+/// `RequestConfig::default()`.
+///
+/// NOTE(review): the search request in this file sends the value under the key
+/// `prioritize_token_positions`; Typesense's API reference appears to name the
+/// parameter `prioritize_token_position` (singular) -- confirm the key matches,
+/// otherwise the setting may have no effect.
+const fn default_prioritize_token_positions() -> bool {
+    // true
+    false
+}
+
+/// Serde default for `RequestConfig::drop_tokens_threshold`, also used by
+/// `RequestConfig::default()`. The search request in this file sends the value
+/// as the `drop_tokens_threshold` parameter.
+const fn default_drop_tokens_threshold() -> usize {
+    // 0
+    1
+}
+
+/// Serde default for `RequestConfig::max_candidates`, also used by
+/// `RequestConfig::default()`. The search request in this file sends the value
+/// as the `max_candidates` parameter.
+const fn default_max_candidates() -> usize {
+    8
+}
+
 impl TypesenseConfig {
    pub fn new(meta_namespace: Option<String>) -> Self {
        Self {
@@ -696,6 +722,14 @@ impl SearchBackend for Typesense {
                    .prioritize_num_matching_fields
                    .to_string(),
            ),
+            (
+                "prioritize_token_positions",
+                info.typesense_config.prioritize_token_positions.to_string(),
+            ),
+            (
+                "drop_tokens_threshold",
+                info.typesense_config.drop_tokens_threshold.to_string(),
+            ),
            (
                "text_match_type",
                info.typesense_config.text_match_type.as_str().to_string(),
@@ -707,6 +741,10 @@ impl SearchBackend for Typesense {
            ("group_limit", "1".to_string()),
            ("facet_by", "project_id".to_string()),
            ("max_facet_values", "0".to_string()),
+            (
+                "max_candidates",
+                info.typesense_config.max_candidates.to_string(),
+            ),
        ];
        if let Some(query_by_weights) =
            Self::query_by_weights(&info.typesense_config)
--- a/scripts/import-projects.py
+++ b/scripts/import-projects.py
@@ -0,0 +1,273 @@
+#!/usr/bin/env python3
+"""
+Search projects on api.modrinth.com and import results into the local database
+with correct author names.
+
+Modes:
+  search  - Import top N results for a text query
+  top     - Import the top N projects by total downloads (for building a
+            representative corpus that mirrors prod IDF distributions)
+
+Usage:
+    python3 scripts/import-projects.py search <query> [limit]
+    python3 scripts/import-projects.py top [count]
+
+Examples:
+    python3 scripts/import-projects.py search "sodium" 5
+    python3 scripts/import-projects.py top 1000
+"""
+
+import json
+import subprocess
+import sys
+import time
+import urllib.parse
+import urllib.request
+
+# Fallback owner id used when inserting a placeholder user for an author fails.
+ADMIN_USER_ID = 103587649610509
+# Container, role and database targeted by the `podman exec ... psql` call in psql().
+DB_CONTAINER = "labrinth-postgres"
+DB_USER = "labrinth"
+DB_NAME = "labrinth"
+# Base URL and request headers for every call made through api_get().
+API_BASE = "https://api.modrinth.com/v2"
+HEADERS = {"User-Agent": "import-projects-script/1.0"}
+
+# Slugs already handled during this run; import_project() skips repeats.
+seen_slugs = set()
+# Author name -> local user id, filled in by get_or_create_author_user().
+author_user_ids = {}
+# Start of the id range used for placeholder author users.
+next_user_id = 200_000_000_000_000
+
+
+def api_get(url, timeout=30):
+    """Fetch ``url`` with the script's headers and return the decoded JSON body.
+
+    ``timeout`` (seconds) bounds how long the request may block; without it a
+    stalled connection would hang the whole import. Network and HTTP errors,
+    including timeouts, propagate to the caller.
+    """
+    req = urllib.request.Request(url, headers=HEADERS)
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        return json.loads(resp.read().decode())
+
+
+def psql(sql):
+    """Run ``sql`` through psql inside the database container.
+
+    Returns True when the command exits with status 0. Otherwise the captured
+    stderr is reported on this process's stderr and False is returned.
+    """
+    command = ["podman", "exec", DB_CONTAINER, "psql"]
+    command += ["-U", DB_USER, "-d", DB_NAME, "-c", sql]
+    proc = subprocess.run(command, capture_output=True, text=True)
+    if proc.returncode == 0:
+        return True
+    print(f"  DB error: {proc.stderr.strip()}", file=sys.stderr)
+    return False
+
+
+def sql_escape(s):
+    """Double every single quote in ``s`` so it can sit inside a SQL string literal."""
+    quote = "'"
+    return s.replace(quote, quote * 2)
+
+
+def get_or_create_author_user(author_name):
+    """Return the local user id for ``author_name``, inserting the user if needed.
+
+    The id is derived from a SHA-256 digest of the name, so the same author
+    maps to the same row on every run of the script. A counter that restarts
+    with each process would hand out ids already taken by other authors, and
+    ``ON CONFLICT (id) DO NOTHING`` would then silently attribute projects to
+    the wrong user.
+
+    Results are cached in ``author_user_ids``. If the insert fails, the author
+    is mapped to ``ADMIN_USER_ID`` instead.
+    """
+    import hashlib
+
+    if author_name in author_user_ids:
+        return author_user_ids[author_name]
+    # 48 bits of the digest on top of the range start stay far below 2**63.
+    digest = hashlib.sha256(author_name.encode("utf-8")).digest()
+    uid = next_user_id + int.from_bytes(digest[:6], "big")
+    name_e = sql_escape(author_name)
+    sql = f"""
+    INSERT INTO users (id, username, email, created, role)
+    VALUES ({uid}, '{name_e}', '{name_e}@imported.local', NOW(), 'developer')
+    ON CONFLICT (id) DO NOTHING;
+    """
+    if psql(sql):
+        author_user_ids[author_name] = uid
+    else:
+        author_user_ids[author_name] = ADMIN_USER_ID
+    return author_user_ids[author_name]
+
+
+def import_project(hit, counter):
+    """Insert one Modrinth search hit into the local database.
+
+    ``hit`` is a dict taken from the search response's ``hits`` list and
+    ``counter`` is a caller-supplied sequence number mixed into the generated
+    row ids.
+
+    Returns False without touching the database when the hit's slug was
+    already seen during this run; otherwise returns the result of ``psql`` for
+    the insert transaction (True on success, False on a DB error).
+    """
+    slug = hit.get("slug", "")
+    if slug in seen_slugs:
+        return False
+    seen_slugs.add(slug)
+
+    title = hit.get("title", "")
+    # NOTE(review): a key that is present but null would make this slice raise
+    # TypeError -- confirm the search API never returns a null description.
+    summary = hit.get("description", "")[:2048]
+    project_id_api = hit.get("project_id", "")
+    downloads = hit.get("downloads", 0)
+    follows = hit.get("follows", 0)
+    icon_url = hit.get("icon_url") or None
+    author_name = hit.get("author", "Unknown")
+
+    print(f"  Fetching: {title}")
+    # The search hit only carries the short description; the full body comes
+    # from the project endpoint. Any failure there falls back to the summary.
+    try:
+        project_data = api_get(f"{API_BASE}/project/{project_id_api}")
+        description = (project_data.get("body") or "")[:65536]
+        icon_url = project_data.get("icon_url") or icon_url
+    except Exception:
+        description = summary
+
+    author_id = get_or_create_author_user(author_name)
+
+    # Clock-derived base in [1e14, 1e15); the four row ids below are
+    # consecutive offsets from base + counter * 5.
+    base = int(time.time() * 1e9) % 900_000_000_000_000 + 100_000_000_000_000
+    mod_id = base + counter * 5
+    team_id = base + counter * 5 + 1
+    member_id = base + counter * 5 + 2
+    version_id = base + counter * 5 + 3
+
+    title_e = sql_escape(title)
+    summary_e = sql_escape(summary)
+    description_e = sql_escape(description)
+    slug_e = sql_escape(slug)
+    # Quoted literal when an icon is known, SQL NULL otherwise.
+    icon_col = f"'{sql_escape(icon_url)}'" if icon_url else "NULL"
+
+    print(
+        f"  Importing: {title} (author={author_name}, downloads={downloads}, followers={follows})"
+    )
+
+    # One transaction creating the team, the project, its owner membership and
+    # a single placeholder version. downloads/follows are interpolated unquoted.
+    # NOTE(review): loader_id 2 and the permissions value are hard-coded --
+    # presumably they match rows/bitmasks in the local dev database; verify.
+    sql = f"""
+BEGIN;
+
+INSERT INTO teams (id) VALUES ({team_id});
+
+INSERT INTO mods (
+    id, team_id, name, summary, description,
+    published, downloads, follows,
+    status, license, side_types_migration_review_status,
+    components, monetization_status, slug,
+    icon_url, raw_icon_url
+) VALUES (
+    {mod_id},
+    {team_id},
+    '{title_e}',
+    '{summary_e}',
+    '{description_e}',
+    NOW(),
+    {downloads},
+    {follows},
+    'approved',
+    'LicenseRef-All-Rights-Reserved',
+    'reviewed',
+    '{{}}'::jsonb,
+    'monetized',
+    LOWER('{slug_e}'),
+    {icon_col},
+    {icon_col}
+);
+
+INSERT INTO team_members (
+    id, team_id, user_id, role, permissions,
+    accepted, payouts_split, ordering, is_owner
+) VALUES (
+    {member_id},
+    {team_id},
+    {author_id},
+    'Owner',
+    1275068466,
+    true,
+    1.00000000000000000000,
+    0,
+    true
+);
+
+INSERT INTO versions (
+    id, mod_id, name, version_number, version_type,
+    author_id, downloads, changelog, status, components
+) VALUES (
+    {version_id},
+    {mod_id},
+    '1.0.0',
+    '1.0.0',
+    'release',
+    {author_id},
+    {downloads},
+    '',
+    'listed',
+    '{{}}'::jsonb
+);
+
+INSERT INTO loaders_versions (loader_id, version_id) VALUES (2, {version_id});
+
+COMMIT;
+"""
+    return psql(sql)
+
+
+def mode_search(query, limit=5):
+    """Import the first ``limit`` search results for ``query``.
+
+    Each hit is passed to ``import_project`` together with its position in the
+    result list; the number of successful imports is printed at the end.
+    """
+    encoded_query = urllib.parse.quote(query)
+    search_url = f"{API_BASE}/search?query={encoded_query}&limit={limit}&facets=[]"
+    print(f"Searching Modrinth for: {query} (limit: {limit})")
+
+    hits = api_get(search_url).get("hits", [])
+    if not hits:
+        print("No results found.")
+        return
+
+    # Hits are processed in order; only truthy import results are counted.
+    imported = sum(
+        1 for position, hit in enumerate(hits) if import_project(hit, position)
+    )
+
+    print(f"Done. Imported {imported} project(s).")
+
+
+def mode_top(count=1000):
+    """Import up to ``count`` projects from the search index sorted by downloads.
+
+    Results are requested in pages of 50, with a one-second pause after each
+    page. Paging stops early when a page comes back empty.
+    """
+    print(f"Fetching top {count} projects by downloads from Modrinth...")
+
+    page_size = 50
+    imported = 0
+    processed = 0  # running sequence number handed to import_project
+    offset = 0
+
+    while offset < count:
+        limit = min(page_size, count - offset)
+        url = (
+            f"{API_BASE}/search?limit={limit}&offset={offset}&index=downloads&facets=[]"
+        )
+        print(f"\n  Batch offset={offset}, limit={limit}")
+
+        hits = api_get(url).get("hits", [])
+        if not hits:
+            break
+
+        for sequence, hit in enumerate(hits, start=processed):
+            if import_project(hit, sequence):
+                imported += 1
+        processed += len(hits)
+
+        time.sleep(1)
+        offset += page_size
+
+    print(f"\nDone. Imported {imported} project(s).")
+
+
+def main():
+    """Parse ``sys.argv`` and dispatch to ``mode_search`` or ``mode_top``.
+
+    Prints a usage message and exits with status 1 when the mode is missing
+    or unknown, or when ``search`` is given without a query. The optional
+    limit/count arguments are converted with ``int``.
+    """
+    if len(sys.argv) < 2:
+        print(f"Usage: {sys.argv[0]} search <query> [limit]")
+        print(f"       {sys.argv[0]} top [count]")
+        sys.exit(1)
+
+    mode = sys.argv[1]
+
+    if mode == "search":
+        if len(sys.argv) < 3:
+            # Must be an f-string so the program name is interpolated.
+            print(f"Usage: {sys.argv[0]} search <query> [limit]")
+            sys.exit(1)
+        query = sys.argv[2]
+        limit = int(sys.argv[3]) if len(sys.argv) > 3 else 5
+        mode_search(query, limit)
+    elif mode == "top":
+        count = int(sys.argv[2]) if len(sys.argv) > 2 else 1000
+        mode_top(count)
+    else:
+        print(f"Unknown mode: {mode}. Use 'search' or 'top'.")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()