Tweak token prioritization in Typesense (#5776)

* Tweak token prioritization in Typesense

* tweaks

* allow configuring max_candidates

* tweak max_candidates

* final changes
This commit is contained in:
aecsocket
2026-04-15 20:45:41 +01:00
committed by GitHub
parent 3d5f29a7a2
commit 546b117437
3 changed files with 325 additions and 14 deletions

View File

@@ -16,7 +16,7 @@ DATABASE_URL=postgresql://labrinth:labrinth@localhost/labrinth
DATABASE_MIN_CONNECTIONS=0 DATABASE_MIN_CONNECTIONS=0
DATABASE_MAX_CONNECTIONS=16 DATABASE_MAX_CONNECTIONS=16
SEARCH_BACKEND=meilisearch SEARCH_BACKEND=typesense
# Meilisearch configuration # Meilisearch configuration
MEILISEARCH_READ_ADDR=http://localhost:7700 MEILISEARCH_READ_ADDR=http://localhost:7700

View File

@@ -83,10 +83,16 @@ pub struct RequestConfig {
pub prioritize_exact_match: bool, pub prioritize_exact_match: bool,
#[serde(default = "default_prioritize_num_matching_fields")] #[serde(default = "default_prioritize_num_matching_fields")]
pub prioritize_num_matching_fields: bool, pub prioritize_num_matching_fields: bool,
#[serde(default = "default_prioritize_token_positions")]
pub prioritize_token_positions: bool,
#[serde(default = "default_drop_tokens_threshold")]
pub drop_tokens_threshold: usize,
#[serde(default)] #[serde(default)]
pub text_match_type: TextMatchType, pub text_match_type: TextMatchType,
#[serde(default)] #[serde(default)]
pub bucketing: Bucketing, pub bucketing: Bucketing,
#[serde(default = "default_max_candidates")]
pub max_candidates: usize,
} }
impl Default for RequestConfig { impl Default for RequestConfig {
@@ -98,32 +104,38 @@ impl Default for RequestConfig {
prioritize_exact_match: default_prioritize_exact_match(), prioritize_exact_match: default_prioritize_exact_match(),
prioritize_num_matching_fields: prioritize_num_matching_fields:
default_prioritize_num_matching_fields(), default_prioritize_num_matching_fields(),
prioritize_token_positions: default_prioritize_token_positions(),
drop_tokens_threshold: default_drop_tokens_threshold(),
text_match_type: TextMatchType::default(), text_match_type: TextMatchType::default(),
bucketing: Bucketing::default(), bucketing: Bucketing::default(),
max_candidates: default_max_candidates(),
} }
} }
} }
fn default_query_by() -> Vec<String> { fn default_query_by() -> Vec<String> {
[ // [
"name", // "name",
"indexed_name", // "indexed_name",
"slug", // "slug",
"author", // "author",
"indexed_author", // "indexed_author",
"summary", // "summary",
] // ]
.into_iter() ["name", "indexed_name", "slug", "author", "indexed_author"]
.map(str::to_string) .into_iter()
.collect() .map(str::to_string)
.collect()
} }
fn default_query_by_weights() -> Vec<u8> { fn default_query_by_weights() -> Vec<u8> {
vec![15, 15, 10, 3, 3, 1] // vec![15, 15, 10, 3, 3, 1]
vec![15, 15, 10, 3, 3]
} }
fn default_prefix() -> Vec<bool> { fn default_prefix() -> Vec<bool> {
vec![true, true, true, true, true, true] // vec![true, true, true, true, true, true]
vec![true, true, true, true, true]
} }
const fn default_prioritize_exact_match() -> bool { const fn default_prioritize_exact_match() -> bool {
@@ -134,6 +146,20 @@ const fn default_prioritize_num_matching_fields() -> bool {
false false
} }
/// Default for `RequestConfig::prioritize_token_positions`: disabled.
///
/// Named by that field's `#[serde(default = "...")]` attribute and called from
/// `impl Default for RequestConfig`.
// NOTE(review): the search request sends this under the key
// `prioritize_token_positions`; Typesense's documentation appears to spell the
// parameter `prioritize_token_position` (singular) — confirm the key is honoured.
const fn default_prioritize_token_positions() -> bool {
    // Alternative value kept for reference: `true`.
    false
}
/// Default for `RequestConfig::drop_tokens_threshold`.
///
/// Named by that field's `#[serde(default = "...")]` attribute and called from
/// `impl Default for RequestConfig`; the value is forwarded to Typesense as the
/// `drop_tokens_threshold` search parameter.
const fn default_drop_tokens_threshold() -> usize {
    // Alternative value kept for reference: `0`.
    1
}
/// Default for `RequestConfig::max_candidates`.
///
/// Named by that field's `#[serde(default = "...")]` attribute and called from
/// `impl Default for RequestConfig`; the value is forwarded to Typesense as the
/// `max_candidates` search parameter.
const fn default_max_candidates() -> usize {
    const DEFAULT_MAX_CANDIDATES: usize = 8;
    DEFAULT_MAX_CANDIDATES
}
impl TypesenseConfig { impl TypesenseConfig {
pub fn new(meta_namespace: Option<String>) -> Self { pub fn new(meta_namespace: Option<String>) -> Self {
Self { Self {
@@ -696,6 +722,14 @@ impl SearchBackend for Typesense {
.prioritize_num_matching_fields .prioritize_num_matching_fields
.to_string(), .to_string(),
), ),
(
"prioritize_token_positions",
info.typesense_config.prioritize_token_positions.to_string(),
),
(
"drop_tokens_threshold",
info.typesense_config.drop_tokens_threshold.to_string(),
),
( (
"text_match_type", "text_match_type",
info.typesense_config.text_match_type.as_str().to_string(), info.typesense_config.text_match_type.as_str().to_string(),
@@ -707,6 +741,10 @@ impl SearchBackend for Typesense {
("group_limit", "1".to_string()), ("group_limit", "1".to_string()),
("facet_by", "project_id".to_string()), ("facet_by", "project_id".to_string()),
("max_facet_values", "0".to_string()), ("max_facet_values", "0".to_string()),
(
"max_candidates",
info.typesense_config.max_candidates.to_string(),
),
]; ];
if let Some(query_by_weights) = if let Some(query_by_weights) =
Self::query_by_weights(&info.typesense_config) Self::query_by_weights(&info.typesense_config)

273
scripts/import-projects.py Executable file
View File

@@ -0,0 +1,273 @@
#!/usr/bin/env python3
"""
Search projects on api.modrinth.com and import results into the local database
with correct author names.
Modes:
search - Import top N results for a text query
top - Import the top N projects by total downloads (for building a
representative corpus that mirrors prod IDF distributions)
Usage:
python3 scripts/import-projects.py search <query> [limit]
python3 scripts/import-projects.py top [count]
Examples:
python3 scripts/import-projects.py search "sodium" 5
python3 scripts/import-projects.py top 1000
"""
import json
import subprocess
import sys
import time
import urllib.parse
import urllib.request
# User id used as the fallback project owner when inserting an author user fails.
ADMIN_USER_ID = 103587649610509
# Container name handed to `podman exec` when running psql.
DB_CONTAINER = "labrinth-postgres"
# psql `-U` (user) and `-d` (database) arguments.
DB_USER = "labrinth"
DB_NAME = "labrinth"
# Base URL of the Modrinth API that search and project requests are built on.
API_BASE = "https://api.modrinth.com/v2"
# Headers sent with every API request.
HEADERS = {"User-Agent": "import-projects-script/1.0"}
# Slugs already handled during this run; a slug seen twice is skipped.
seen_slugs = set()
# Cache: author name -> user id chosen for that author during this run.
author_user_ids = {}
# Next id to hand out when a new author user row is created; restarts at this
# value on every run of the script.
next_user_id = 200_000_000_000_000
def api_get(url):
    """Fetch ``url`` with the script's request headers and return the parsed JSON body.

    Any error raised by ``urllib`` or by JSON decoding propagates to the caller.
    """
    request = urllib.request.Request(url, headers=HEADERS)
    with urllib.request.urlopen(request) as response:
        raw_body = response.read()
    return json.loads(raw_body.decode())
def psql(sql):
    """Run ``sql`` with ``psql -c`` inside the database container via ``podman exec``.

    Returns:
        True when the command exits with status 0. Otherwise the captured
        stderr is reported on this process's stderr and False is returned.
    """
    command = ["podman", "exec", DB_CONTAINER, "psql"]
    command += ["-U", DB_USER, "-d", DB_NAME, "-c", sql]
    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode == 0:
        return True
    print(f" DB error: {result.stderr.strip()}", file=sys.stderr)
    return False
def sql_escape(s):
    """Return ``s`` with every single quote doubled, for embedding in a SQL string literal."""
    # Splitting on the quote and re-joining with two quotes doubles each one.
    return "''".join(s.split("'"))
def get_or_create_author_user(author_name):
    """Return the local user id to use for ``author_name``, inserting a user row on first sight.

    The result is cached in ``author_user_ids`` so each author name triggers at
    most one INSERT per run. When the INSERT fails, the author is mapped to
    ``ADMIN_USER_ID`` instead.
    """
    global next_user_id
    try:
        return author_user_ids[author_name]
    except KeyError:
        pass

    # Reserve the id before touching the database; it is consumed even if the
    # insert below fails.
    uid = next_user_id
    next_user_id = uid + 1
    name_e = sql_escape(author_name)
    # NOTE(review): ids restart from the same constant on every run and the
    # statement ignores id conflicts, so on a second run a new author may be
    # mapped to a row created earlier for a different name — confirm whether
    # that matters for the intended workflow.
    sql = f"""
INSERT INTO users (id, username, email, created, role)
VALUES ({uid}, '{name_e}', '{name_e}@imported.local', NOW(), 'developer')
ON CONFLICT (id) DO NOTHING;
"""
    resolved_id = uid if psql(sql) else ADMIN_USER_ID
    author_user_ids[author_name] = resolved_id
    return resolved_id
def import_project(hit, counter):
    """Insert one search hit into the local database as a project with a single version.

    Args:
        hit: One entry from the search response's ``hits`` list (a dict).
        counter: Per-run index of this hit; spaces out the generated row ids.

    Returns:
        False when the hit's slug was already handled in this run; otherwise
        the success flag returned by ``psql`` for the insert transaction.
    """
    slug = hit.get("slug", "")
    if slug in seen_slugs:
        return False
    seen_slugs.add(slug)

    title = hit.get("title", "")
    summary = hit.get("description", "")[:2048]
    project_id_api = hit.get("project_id", "")
    downloads = hit.get("downloads", 0)
    follows = hit.get("follows", 0)
    icon_url = hit.get("icon_url") or None
    author_name = hit.get("author", "Unknown")

    print(f" Fetching: {title}")
    # Best effort: use the full project body when it can be fetched, otherwise
    # fall back to the (truncated) search summary.
    try:
        project_data = api_get(f"{API_BASE}/project/{project_id_api}")
        description = (project_data.get("body") or "")[:65536]
        icon_url = project_data.get("icon_url") or icon_url
    except Exception:
        description = summary

    author_id = get_or_create_author_user(author_name)

    # Time-derived base id; each hit takes a stride of five ids, of which the
    # first four are used.
    base = int(time.time() * 1e9) % 900_000_000_000_000 + 100_000_000_000_000
    first_id = base + counter * 5
    mod_id, team_id, member_id, version_id = (first_id + step for step in range(4))

    title_e, summary_e, description_e, slug_e = map(
        sql_escape, (title, summary, description, slug)
    )
    if icon_url:
        icon_col = f"'{sql_escape(icon_url)}'"
    else:
        icon_col = "NULL"

    print(
        f" Importing: {title} (author={author_name}, downloads={downloads}, followers={follows})"
    )
    # One transaction: team, project, owner membership, a single version, and
    # that version's loader link.
    sql = f"""
BEGIN;
INSERT INTO teams (id) VALUES ({team_id});
INSERT INTO mods (
id, team_id, name, summary, description,
published, downloads, follows,
status, license, side_types_migration_review_status,
components, monetization_status, slug,
icon_url, raw_icon_url
) VALUES (
{mod_id},
{team_id},
'{title_e}',
'{summary_e}',
'{description_e}',
NOW(),
{downloads},
{follows},
'approved',
'LicenseRef-All-Rights-Reserved',
'reviewed',
'{{}}'::jsonb,
'monetized',
LOWER('{slug_e}'),
{icon_col},
{icon_col}
);
INSERT INTO team_members (
id, team_id, user_id, role, permissions,
accepted, payouts_split, ordering, is_owner
) VALUES (
{member_id},
{team_id},
{author_id},
'Owner',
1275068466,
true,
1.00000000000000000000,
0,
true
);
INSERT INTO versions (
id, mod_id, name, version_number, version_type,
author_id, downloads, changelog, status, components
) VALUES (
{version_id},
{mod_id},
'1.0.0',
'1.0.0',
'release',
{author_id},
{downloads},
'',
'listed',
'{{}}'::jsonb
);
INSERT INTO loaders_versions (loader_id, version_id) VALUES (2, {version_id});
COMMIT;
"""
    return psql(sql)
def mode_search(query, limit=5):
    """Search Modrinth for ``query`` and import up to ``limit`` of the returned hits.

    Prints progress, and a final count of the projects actually imported.
    """
    encoded_query = urllib.parse.quote(query)
    search_url = f"{API_BASE}/search?query={encoded_query}&limit={limit}&facets=[]"
    print(f"Searching Modrinth for: {query} (limit: {limit})")

    hits = api_get(search_url).get("hits", [])
    if not hits:
        print("No results found.")
        return

    # The hit's position in the result list doubles as its id-spacing counter.
    imported = sum(
        1 for position, hit in enumerate(hits) if import_project(hit, position)
    )
    print(f"Done. Imported {imported} project(s).")
def mode_top(count=1000):
    """Import the top ``count`` projects, using the search index named ``downloads``.

    Results are requested in pages of at most 50 hits; paging stops early when
    a page comes back empty. Prints progress and a final imported count.
    """
    print(f"Fetching top {count} projects by downloads from Modrinth...")
    batch_size = 50
    imported = 0
    counter = 0
    for offset in range(0, count, batch_size):
        # The final page may be shorter than a full batch.
        limit = min(batch_size, count - offset)
        url = (
            f"{API_BASE}/search?limit={limit}&offset={offset}&index=downloads&facets=[]"
        )
        print(f"\n Batch offset={offset}, limit={limit}")
        hits = api_get(url).get("hits", [])
        if not hits:
            break
        # `counter` keeps increasing across pages so each hit gets its own index.
        for index, hit in enumerate(hits, start=counter):
            if import_project(hit, index):
                imported += 1
        counter += len(hits)
        # NOTE(review): pause assumed to be once per page (indentation was not
        # recoverable from the source) — confirm against the script.
        time.sleep(1)
    print(f"\nDone. Imported {imported} project(s).")
def main():
    """Command-line entry point: dispatch to ``search`` or ``top`` mode from ``sys.argv``.

    Usage:
        <script> search <query> [limit]   (limit defaults to 5)
        <script> top [count]              (count defaults to 1000)

    Prints a usage or error message and exits with status 1 when no mode is
    given, when ``search`` is missing its query, or when the mode is unknown.
    A non-integer ``limit``/``count`` raises ``ValueError`` from ``int()``.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} search <query> [limit]")
        print(f" {sys.argv[0]} top [count]")
        sys.exit(1)
    mode = sys.argv[1]
    if mode == "search":
        if len(sys.argv) < 3:
            # Must be an f-string: without the prefix the literal text
            # "{sys.argv[0]}" was printed instead of the program name.
            print(f"Usage: {sys.argv[0]} search <query> [limit]")
            sys.exit(1)
        query = sys.argv[2]
        limit = int(sys.argv[3]) if len(sys.argv) > 3 else 5
        mode_search(query, limit)
    elif mode == "top":
        count = int(sys.argv[2]) if len(sys.argv) > 2 else 1000
        mode_top(count)
    else:
        print(f"Unknown mode: {mode}. Use 'search' or 'top'.")
        sys.exit(1)
if __name__ == "__main__":
main()