Tweak token prioritization in Typesense (#5776)
* Tweak toke prioritization in typesense * tweaks * allow configuring max_candidates * tweak max_candidates * final changes
This commit is contained in:
@@ -16,7 +16,7 @@ DATABASE_URL=postgresql://labrinth:labrinth@localhost/labrinth
|
|||||||
DATABASE_MIN_CONNECTIONS=0
|
DATABASE_MIN_CONNECTIONS=0
|
||||||
DATABASE_MAX_CONNECTIONS=16
|
DATABASE_MAX_CONNECTIONS=16
|
||||||
|
|
||||||
SEARCH_BACKEND=meilisearch
|
SEARCH_BACKEND=typesense
|
||||||
|
|
||||||
# Meilisearch configuration
|
# Meilisearch configuration
|
||||||
MEILISEARCH_READ_ADDR=http://localhost:7700
|
MEILISEARCH_READ_ADDR=http://localhost:7700
|
||||||
|
|||||||
@@ -83,10 +83,16 @@ pub struct RequestConfig {
|
|||||||
pub prioritize_exact_match: bool,
|
pub prioritize_exact_match: bool,
|
||||||
#[serde(default = "default_prioritize_num_matching_fields")]
|
#[serde(default = "default_prioritize_num_matching_fields")]
|
||||||
pub prioritize_num_matching_fields: bool,
|
pub prioritize_num_matching_fields: bool,
|
||||||
|
#[serde(default = "default_prioritize_token_positions")]
|
||||||
|
pub prioritize_token_positions: bool,
|
||||||
|
#[serde(default = "default_drop_tokens_threshold")]
|
||||||
|
pub drop_tokens_threshold: usize,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub text_match_type: TextMatchType,
|
pub text_match_type: TextMatchType,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub bucketing: Bucketing,
|
pub bucketing: Bucketing,
|
||||||
|
#[serde(default = "default_max_candidates")]
|
||||||
|
pub max_candidates: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for RequestConfig {
|
impl Default for RequestConfig {
|
||||||
@@ -98,32 +104,38 @@ impl Default for RequestConfig {
|
|||||||
prioritize_exact_match: default_prioritize_exact_match(),
|
prioritize_exact_match: default_prioritize_exact_match(),
|
||||||
prioritize_num_matching_fields:
|
prioritize_num_matching_fields:
|
||||||
default_prioritize_num_matching_fields(),
|
default_prioritize_num_matching_fields(),
|
||||||
|
prioritize_token_positions: default_prioritize_token_positions(),
|
||||||
|
drop_tokens_threshold: default_drop_tokens_threshold(),
|
||||||
text_match_type: TextMatchType::default(),
|
text_match_type: TextMatchType::default(),
|
||||||
bucketing: Bucketing::default(),
|
bucketing: Bucketing::default(),
|
||||||
|
max_candidates: default_max_candidates(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn default_query_by() -> Vec<String> {
|
fn default_query_by() -> Vec<String> {
|
||||||
[
|
// [
|
||||||
"name",
|
// "name",
|
||||||
"indexed_name",
|
// "indexed_name",
|
||||||
"slug",
|
// "slug",
|
||||||
"author",
|
// "author",
|
||||||
"indexed_author",
|
// "indexed_author",
|
||||||
"summary",
|
// "summary",
|
||||||
]
|
// ]
|
||||||
|
["name", "indexed_name", "slug", "author", "indexed_author"]
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(str::to_string)
|
.map(str::to_string)
|
||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn default_query_by_weights() -> Vec<u8> {
|
fn default_query_by_weights() -> Vec<u8> {
|
||||||
vec![15, 15, 10, 3, 3, 1]
|
// vec![15, 15, 10, 3, 3, 1]
|
||||||
|
vec![15, 15, 10, 3, 3]
|
||||||
}
|
}
|
||||||
|
|
||||||
fn default_prefix() -> Vec<bool> {
|
fn default_prefix() -> Vec<bool> {
|
||||||
vec![true, true, true, true, true, true]
|
// vec![true, true, true, true, true, true]
|
||||||
|
vec![true, true, true, true, true]
|
||||||
}
|
}
|
||||||
|
|
||||||
const fn default_prioritize_exact_match() -> bool {
|
const fn default_prioritize_exact_match() -> bool {
|
||||||
@@ -134,6 +146,20 @@ const fn default_prioritize_num_matching_fields() -> bool {
|
|||||||
false
|
false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const fn default_prioritize_token_positions() -> bool {
|
||||||
|
// true
|
||||||
|
false
|
||||||
|
}
|
||||||
|
|
||||||
|
const fn default_drop_tokens_threshold() -> usize {
|
||||||
|
// 0
|
||||||
|
1
|
||||||
|
}
|
||||||
|
|
||||||
|
const fn default_max_candidates() -> usize {
|
||||||
|
8
|
||||||
|
}
|
||||||
|
|
||||||
impl TypesenseConfig {
|
impl TypesenseConfig {
|
||||||
pub fn new(meta_namespace: Option<String>) -> Self {
|
pub fn new(meta_namespace: Option<String>) -> Self {
|
||||||
Self {
|
Self {
|
||||||
@@ -696,6 +722,14 @@ impl SearchBackend for Typesense {
|
|||||||
.prioritize_num_matching_fields
|
.prioritize_num_matching_fields
|
||||||
.to_string(),
|
.to_string(),
|
||||||
),
|
),
|
||||||
|
(
|
||||||
|
"prioritize_token_positions",
|
||||||
|
info.typesense_config.prioritize_token_positions.to_string(),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"drop_tokens_threshold",
|
||||||
|
info.typesense_config.drop_tokens_threshold.to_string(),
|
||||||
|
),
|
||||||
(
|
(
|
||||||
"text_match_type",
|
"text_match_type",
|
||||||
info.typesense_config.text_match_type.as_str().to_string(),
|
info.typesense_config.text_match_type.as_str().to_string(),
|
||||||
@@ -707,6 +741,10 @@ impl SearchBackend for Typesense {
|
|||||||
("group_limit", "1".to_string()),
|
("group_limit", "1".to_string()),
|
||||||
("facet_by", "project_id".to_string()),
|
("facet_by", "project_id".to_string()),
|
||||||
("max_facet_values", "0".to_string()),
|
("max_facet_values", "0".to_string()),
|
||||||
|
(
|
||||||
|
"max_candidates",
|
||||||
|
info.typesense_config.max_candidates.to_string(),
|
||||||
|
),
|
||||||
];
|
];
|
||||||
if let Some(query_by_weights) =
|
if let Some(query_by_weights) =
|
||||||
Self::query_by_weights(&info.typesense_config)
|
Self::query_by_weights(&info.typesense_config)
|
||||||
|
|||||||
273
scripts/import-projects.py
Executable file
273
scripts/import-projects.py
Executable file
@@ -0,0 +1,273 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Search projects on api.modrinth.com and import results into the local database
|
||||||
|
with correct author names.
|
||||||
|
|
||||||
|
Modes:
|
||||||
|
search - Import top N results for a text query
|
||||||
|
top - Import the top N projects by total downloads (for building a
|
||||||
|
representative corpus that mirrors prod IDF distributions)
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/import-projects.py search <query> [limit]
|
||||||
|
python3 scripts/import-projects.py top [count]
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
python3 scripts/import-projects.py search "sodium" 5
|
||||||
|
python3 scripts/import-projects.py top 1000
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import urllib.parse
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
ADMIN_USER_ID = 103587649610509
|
||||||
|
DB_CONTAINER = "labrinth-postgres"
|
||||||
|
DB_USER = "labrinth"
|
||||||
|
DB_NAME = "labrinth"
|
||||||
|
API_BASE = "https://api.modrinth.com/v2"
|
||||||
|
HEADERS = {"User-Agent": "import-projects-script/1.0"}
|
||||||
|
|
||||||
|
seen_slugs = set()
|
||||||
|
author_user_ids = {}
|
||||||
|
next_user_id = 200_000_000_000_000
|
||||||
|
|
||||||
|
|
||||||
|
def api_get(url):
|
||||||
|
req = urllib.request.Request(url, headers=HEADERS)
|
||||||
|
with urllib.request.urlopen(req) as resp:
|
||||||
|
return json.loads(resp.read().decode())
|
||||||
|
|
||||||
|
|
||||||
|
def psql(sql):
|
||||||
|
result = subprocess.run(
|
||||||
|
[
|
||||||
|
"podman",
|
||||||
|
"exec",
|
||||||
|
DB_CONTAINER,
|
||||||
|
"psql",
|
||||||
|
"-U",
|
||||||
|
DB_USER,
|
||||||
|
"-d",
|
||||||
|
DB_NAME,
|
||||||
|
"-c",
|
||||||
|
sql,
|
||||||
|
],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
)
|
||||||
|
if result.returncode != 0:
|
||||||
|
print(f" DB error: {result.stderr.strip()}", file=sys.stderr)
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def sql_escape(s):
|
||||||
|
return s.replace("'", "''")
|
||||||
|
|
||||||
|
|
||||||
|
def get_or_create_author_user(author_name):
|
||||||
|
global next_user_id
|
||||||
|
if author_name in author_user_ids:
|
||||||
|
return author_user_ids[author_name]
|
||||||
|
uid = next_user_id
|
||||||
|
next_user_id += 1
|
||||||
|
name_e = sql_escape(author_name)
|
||||||
|
sql = f"""
|
||||||
|
INSERT INTO users (id, username, email, created, role)
|
||||||
|
VALUES ({uid}, '{name_e}', '{name_e}@imported.local', NOW(), 'developer')
|
||||||
|
ON CONFLICT (id) DO NOTHING;
|
||||||
|
"""
|
||||||
|
if psql(sql):
|
||||||
|
author_user_ids[author_name] = uid
|
||||||
|
else:
|
||||||
|
author_user_ids[author_name] = ADMIN_USER_ID
|
||||||
|
return author_user_ids[author_name]
|
||||||
|
|
||||||
|
|
||||||
|
def import_project(hit, counter):
|
||||||
|
slug = hit.get("slug", "")
|
||||||
|
if slug in seen_slugs:
|
||||||
|
return False
|
||||||
|
seen_slugs.add(slug)
|
||||||
|
|
||||||
|
title = hit.get("title", "")
|
||||||
|
summary = hit.get("description", "")[:2048]
|
||||||
|
project_id_api = hit.get("project_id", "")
|
||||||
|
downloads = hit.get("downloads", 0)
|
||||||
|
follows = hit.get("follows", 0)
|
||||||
|
icon_url = hit.get("icon_url") or None
|
||||||
|
author_name = hit.get("author", "Unknown")
|
||||||
|
|
||||||
|
print(f" Fetching: {title}")
|
||||||
|
try:
|
||||||
|
project_data = api_get(f"{API_BASE}/project/{project_id_api}")
|
||||||
|
description = (project_data.get("body") or "")[:65536]
|
||||||
|
icon_url = project_data.get("icon_url") or icon_url
|
||||||
|
except Exception:
|
||||||
|
description = summary
|
||||||
|
|
||||||
|
author_id = get_or_create_author_user(author_name)
|
||||||
|
|
||||||
|
base = int(time.time() * 1e9) % 900_000_000_000_000 + 100_000_000_000_000
|
||||||
|
mod_id = base + counter * 5
|
||||||
|
team_id = base + counter * 5 + 1
|
||||||
|
member_id = base + counter * 5 + 2
|
||||||
|
version_id = base + counter * 5 + 3
|
||||||
|
|
||||||
|
title_e = sql_escape(title)
|
||||||
|
summary_e = sql_escape(summary)
|
||||||
|
description_e = sql_escape(description)
|
||||||
|
slug_e = sql_escape(slug)
|
||||||
|
icon_col = f"'{sql_escape(icon_url)}'" if icon_url else "NULL"
|
||||||
|
|
||||||
|
print(
|
||||||
|
f" Importing: {title} (author={author_name}, downloads={downloads}, followers={follows})"
|
||||||
|
)
|
||||||
|
|
||||||
|
sql = f"""
|
||||||
|
BEGIN;
|
||||||
|
|
||||||
|
INSERT INTO teams (id) VALUES ({team_id});
|
||||||
|
|
||||||
|
INSERT INTO mods (
|
||||||
|
id, team_id, name, summary, description,
|
||||||
|
published, downloads, follows,
|
||||||
|
status, license, side_types_migration_review_status,
|
||||||
|
components, monetization_status, slug,
|
||||||
|
icon_url, raw_icon_url
|
||||||
|
) VALUES (
|
||||||
|
{mod_id},
|
||||||
|
{team_id},
|
||||||
|
'{title_e}',
|
||||||
|
'{summary_e}',
|
||||||
|
'{description_e}',
|
||||||
|
NOW(),
|
||||||
|
{downloads},
|
||||||
|
{follows},
|
||||||
|
'approved',
|
||||||
|
'LicenseRef-All-Rights-Reserved',
|
||||||
|
'reviewed',
|
||||||
|
'{{}}'::jsonb,
|
||||||
|
'monetized',
|
||||||
|
LOWER('{slug_e}'),
|
||||||
|
{icon_col},
|
||||||
|
{icon_col}
|
||||||
|
);
|
||||||
|
|
||||||
|
INSERT INTO team_members (
|
||||||
|
id, team_id, user_id, role, permissions,
|
||||||
|
accepted, payouts_split, ordering, is_owner
|
||||||
|
) VALUES (
|
||||||
|
{member_id},
|
||||||
|
{team_id},
|
||||||
|
{author_id},
|
||||||
|
'Owner',
|
||||||
|
1275068466,
|
||||||
|
true,
|
||||||
|
1.00000000000000000000,
|
||||||
|
0,
|
||||||
|
true
|
||||||
|
);
|
||||||
|
|
||||||
|
INSERT INTO versions (
|
||||||
|
id, mod_id, name, version_number, version_type,
|
||||||
|
author_id, downloads, changelog, status, components
|
||||||
|
) VALUES (
|
||||||
|
{version_id},
|
||||||
|
{mod_id},
|
||||||
|
'1.0.0',
|
||||||
|
'1.0.0',
|
||||||
|
'release',
|
||||||
|
{author_id},
|
||||||
|
{downloads},
|
||||||
|
'',
|
||||||
|
'listed',
|
||||||
|
'{{}}'::jsonb
|
||||||
|
);
|
||||||
|
|
||||||
|
INSERT INTO loaders_versions (loader_id, version_id) VALUES (2, {version_id});
|
||||||
|
|
||||||
|
COMMIT;
|
||||||
|
"""
|
||||||
|
return psql(sql)
|
||||||
|
|
||||||
|
|
||||||
|
def mode_search(query, limit=5):
|
||||||
|
encoded_query = urllib.parse.quote(query)
|
||||||
|
search_url = f"{API_BASE}/search?query={encoded_query}&limit={limit}&facets=[]"
|
||||||
|
print(f"Searching Modrinth for: {query} (limit: {limit})")
|
||||||
|
|
||||||
|
search_data = api_get(search_url)
|
||||||
|
hits = search_data.get("hits", [])
|
||||||
|
|
||||||
|
if not hits:
|
||||||
|
print("No results found.")
|
||||||
|
return
|
||||||
|
|
||||||
|
imported = 0
|
||||||
|
for i, hit in enumerate(hits):
|
||||||
|
if import_project(hit, i):
|
||||||
|
imported += 1
|
||||||
|
|
||||||
|
print(f"Done. Imported {imported} project(s).")
|
||||||
|
|
||||||
|
|
||||||
|
def mode_top(count=1000):
|
||||||
|
print(f"Fetching top {count} projects by downloads from Modrinth...")
|
||||||
|
|
||||||
|
imported = 0
|
||||||
|
batch_size = 50
|
||||||
|
counter = 0
|
||||||
|
|
||||||
|
for offset in range(0, count, batch_size):
|
||||||
|
limit = min(batch_size, count - offset)
|
||||||
|
url = (
|
||||||
|
f"{API_BASE}/search?limit={limit}&offset={offset}&index=downloads&facets=[]"
|
||||||
|
)
|
||||||
|
print(f"\n Batch offset={offset}, limit={limit}")
|
||||||
|
|
||||||
|
data = api_get(url)
|
||||||
|
hits = data.get("hits", [])
|
||||||
|
|
||||||
|
if not hits:
|
||||||
|
break
|
||||||
|
|
||||||
|
for hit in hits:
|
||||||
|
if import_project(hit, counter):
|
||||||
|
imported += 1
|
||||||
|
counter += 1
|
||||||
|
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
|
print(f"\nDone. Imported {imported} project(s).")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
if len(sys.argv) < 2:
|
||||||
|
print(f"Usage: {sys.argv[0]} search <query> [limit]")
|
||||||
|
print(f" {sys.argv[0]} top [count]")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
mode = sys.argv[1]
|
||||||
|
|
||||||
|
if mode == "search":
|
||||||
|
if len(sys.argv) < 3:
|
||||||
|
print("Usage: {sys.argv[0]} search <query> [limit]")
|
||||||
|
sys.exit(1)
|
||||||
|
query = sys.argv[2]
|
||||||
|
limit = int(sys.argv[3]) if len(sys.argv) > 3 else 5
|
||||||
|
mode_search(query, limit)
|
||||||
|
elif mode == "top":
|
||||||
|
count = int(sys.argv[2]) if len(sys.argv) > 2 else 1000
|
||||||
|
mode_top(count)
|
||||||
|
else:
|
||||||
|
print(f"Unknown mode: {mode}. Use 'search' or 'top'.")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Reference in New Issue
Block a user