Mirror of https://github.com/mealie-recipes/mealie.git, synced 2026-02-03 06:23:10 -05:00
feat: search tokenization, handling of quoted literal search, and postgres fuzziness (#2351)
* Creating postgres migration script and starting to set up to detect database
* non-working placeholders for postgres pg_trgm
* First draft of some indexes
* non-working commit of postgres indexing
* Further non-working edits to db-centric fuzzy search
* update alembic for extensions
* More non-working setup
* Move db type check to init_db
* fix typo in db name check
* Add sqlite token search and postgres full text search
* reorder search to hit exact matches faster
* Add settings and docs for POSTGRES_LANGUAGE (full text search)
* Use user-specified POSTGRES_LANGUAGE in search
* fix fuzzy search typo
* Remove full text search and instead order by trigram match
* cleaner adding of indices, remove fulltext
* Cleanup old import of getting app settings
* Fix typo in index
* Fix some alembic fuzzy typos
* Remove diagnostic printing from alembic migration
* Fix mixed up commutator for trigram operator and relax criteria
* forgot to remove query debug
* sort only on name
* token and fuzzy search tests
* Refactor recipe search test to avoid rare random string cross-matches.
* Add ability to quote parts of search for exact match
* Remove internal punctuation, unless it's quoted for literal search
* Add tests for special character removal and literal search
* Remove the outer double quotes from searches, but leave internal single quotes alone.
* Update tests to avoid intra-test name collisions
* Fixing leftovers highlighted by lint
* cleanup linting and mypy errors
* Fix test cross-matching on dirty db (leftovers from bulk import)
* forgot to cleanup something when debugging mypy errors
* re-order pg_trgm loading in postgres
* address comments
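The headline features are easier to see in isolation. Below is a minimal, hypothetical sketch (not code from this commit) of the tokenization approach used by `_add_search_to_query` in the last hunk: quoted phrases stay together as literal tokens, while punctuation is stripped only from the unquoted remainder. The real method also runs `unidecode()` on the input first and switches to a pg_trgm fuzzy search on Postgres when no quoted literal is present.

```python
import re
import string

# Same quoted-phrase regex as the commit; PUNCTUATION is string.punctuation minus ' and ".
QUOTED = re.compile(r"""(["'])(?:(?=(\\?))\2.)*?\1""")
PUNCTUATION = string.punctuation.replace("'", "").replace('"', "")


def tokenize(search: str) -> tuple[bool, list[str]]:
    """Hypothetical helper: return (is_literal_search, search tokens)."""
    search = search.lower().strip()
    literal = bool(QUOTED.search(search))
    quoted = [m.group()[1:-1] for m in QUOTED.finditer(search)]  # keep phrases, drop outer quotes
    rest = QUOTED.sub("", search).translate(str.maketrans(PUNCTUATION, " " * len(PUNCTUATION)))
    return literal, [t.strip() for t in quoted + rest.split()]


print(tokenize('"self-rising flour" pancakes!'))  # (True, ['self-rising flour', 'pancakes'])
```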
@@ -92,6 +92,9 @@ def main():
         logger.info("Migration needed. Performing migration...")
         command.upgrade(alembic_cfg, "head")
 
+    if session.get_bind().name == "postgresql":  # needed for fuzzy search and fast GIN text indices
+        session.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm;"))
+
     db = get_repositories(session)
 
     if db.users.get_all():
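This hunk turns on the `pg_trgm` extension at migration time; the fuzzy search and the GIN trigram indices below depend on it. A quick, hypothetical way to confirm the extension is available and see what the trigram functions compute (connection URL, credentials, and sample values are assumptions, not from this commit; creating the extension needs a sufficiently privileged role):

```python
import sqlalchemy as sa

engine = sa.create_engine("postgresql://mealie:mealie@localhost:5432/mealie")  # assumed URL

with engine.connect() as conn:
    conn.execute(sa.text("CREATE EXTENSION IF NOT EXISTS pg_trgm;"))
    conn.commit()
    score = conn.execute(
        sa.text("SELECT word_similarity(:q, :name)"),
        {"q": "chiken", "name": "roast chicken and vegetables"},
    ).scalar_one()
    print(score)
    # The `column %> query` filters later in this commit are equivalent to
    # word_similarity(query, column) >= pg_trgm.word_similarity_threshold.
```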
@@ -1,7 +1,9 @@
 from typing import TYPE_CHECKING
 
+import sqlalchemy as sa
 from sqlalchemy import Boolean, Float, ForeignKey, Integer, String, event, orm
 from sqlalchemy.orm import Mapped, mapped_column
+from sqlalchemy.orm.session import Session
 from text_unidecode import unidecode
 
 from mealie.db.models._model_base import BaseMixins, SqlAlchemyBase
@@ -87,7 +89,7 @@ class RecipeIngredientModel(SqlAlchemyBase, BaseMixins):
     original_text_normalized: Mapped[str | None] = mapped_column(String, index=True)
 
     @auto_init()
-    def __init__(self, note: str | None = None, orginal_text: str | None = None, **_) -> None:
+    def __init__(self, session: Session, note: str | None = None, orginal_text: str | None = None, **_) -> None:
         # SQLAlchemy events do not seem to register things that are set during auto_init
         if note is not None:
             self.note_normalized = unidecode(note).lower().strip()
@@ -95,13 +97,51 @@ class RecipeIngredientModel(SqlAlchemyBase, BaseMixins):
         if orginal_text is not None:
             self.orginal_text = unidecode(orginal_text).lower().strip()
 
+        tableargs = [  # base set of indices
+            sa.Index(
+                "ix_recipes_ingredients_note_normalized",
+                "note_normalized",
+                unique=False,
+            ),
+            sa.Index(
+                "ix_recipes_ingredients_original_text_normalized",
+                "original_text_normalized",
+                unique=False,
+            ),
+        ]
+        if session.get_bind().name == "postgresql":
+            tableargs.extend(
+                [
+                    sa.Index(
+                        "ix_recipes_ingredients_note_normalized_gin",
+                        "note_normalized",
+                        unique=False,
+                        postgresql_using="gin",
+                        postgresql_ops={
+                            "note_normalized": "gin_trgm_ops",
+                        },
+                    ),
+                    sa.Index(
+                        "ix_recipes_ingredients_original_text_normalized_gin",
+                        "original_text",
+                        unique=False,
+                        postgresql_using="gin",
+                        postgresql_ops={
+                            "original_text_normalized": "gin_trgm_ops",
+                        },
+                    ),
+                ]
+            )
+
+        # add indices
+        self.__table_args__ = tuple(tableargs)
 
 
 @event.listens_for(RecipeIngredientModel.note, "set")
 def receive_note(target: RecipeIngredientModel, value: str, oldvalue, initiator):
     if value is not None:
-        target.name_normalized = unidecode(value).lower().strip()
+        target.note_normalized = unidecode(value).lower().strip()
     else:
-        target.name_normalized = None
+        target.note_normalized = None
 
 
 @event.listens_for(RecipeIngredientModel.original_text, "set")
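The conditional `tableargs` above adds trigram GIN indices only when the bind is Postgres. For reference, here is a minimal, hypothetical SQLAlchemy model (names are illustrative, not Mealie's) using the same `postgresql_using` / `postgresql_ops` pattern, together with the DDL it compiles to:

```python
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


class Base(DeclarativeBase):
    pass


# Trigram GIN index: accelerates LIKE '%...%' and the %> / <->> trigram operators on Postgres.
gin_index = sa.Index(
    "ix_notes_text_normalized_gin",
    "text_normalized",
    postgresql_using="gin",
    postgresql_ops={"text_normalized": "gin_trgm_ops"},
)


class Note(Base):
    __tablename__ = "notes"
    id: Mapped[int] = mapped_column(primary_key=True)
    text_normalized: Mapped[str] = mapped_column(sa.String, index=True)  # plain b-tree index too
    __table_args__ = (gin_index,)


# CREATE INDEX ix_notes_text_normalized_gin ON notes USING gin (text_normalized gin_trgm_ops)
print(sa.schema.CreateIndex(gin_index).compile(dialect=postgresql.dialect()))
```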
@@ -35,7 +35,9 @@ if TYPE_CHECKING:
 
 class RecipeModel(SqlAlchemyBase, BaseMixins):
     __tablename__ = "recipes"
-    __table_args__ = (sa.UniqueConstraint("slug", "group_id", name="recipe_slug_group_id_key"),)
+    __table_args__: tuple[sa.UniqueConstraint, ...] = (
+        sa.UniqueConstraint("slug", "group_id", name="recipe_slug_group_id_key"),
+    )
 
     id: Mapped[GUID] = mapped_column(GUID, primary_key=True, default=GUID.generate)
     slug: Mapped[str | None] = mapped_column(sa.String, index=True)
@@ -192,6 +194,46 @@ class RecipeModel(SqlAlchemyBase, BaseMixins):
         if description is not None:
             self.description_normalized = unidecode(description).lower().strip()
 
+        tableargs = [  # base set of indices
+            sa.UniqueConstraint("slug", "group_id", name="recipe_slug_group_id_key"),
+            sa.Index(
+                "ix_recipes_name_normalized",
+                "name_normalized",
+                unique=False,
+            ),
+            sa.Index(
+                "ix_recipes_description_normalized",
+                "description_normalized",
+                unique=False,
+            ),
+        ]
+
+        if session.get_bind().name == "postgresql":
+            tableargs.extend(
+                [
+                    sa.Index(
+                        "ix_recipes_name_normalized_gin",
+                        "name_normalized",
+                        unique=False,
+                        postgresql_using="gin",
+                        postgresql_ops={
+                            "name_normalized": "gin_trgm_ops",
+                        },
+                    ),
+                    sa.Index(
+                        "ix_recipes_description_normalized_gin",
+                        "description_normalized",
+                        unique=False,
+                        postgresql_using="gin",
+                        postgresql_ops={
+                            "description_normalized": "gin_trgm_ops",
+                        },
+                    ),
+                ]
+            )
+
+        # add indices
+        self.__table_args__ = tuple(tableargs)
 
 
 @event.listens_for(RecipeModel.name, "set")
 def receive_name(target: RecipeModel, value: str, oldvalue, initiator):
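The same pattern is applied here to `recipes.name_normalized` and `recipes.description_normalized`. A quick, hypothetical way to confirm that the conditional `__table_args__` actually produced the extra indices on a Postgres database (the connection URL is an assumption):

```python
import sqlalchemy as sa

engine = sa.create_engine("postgresql://mealie:mealie@localhost:5432/mealie")  # assumed URL
inspector = sa.inspect(engine)

for ix in inspector.get_indexes("recipes"):
    print(ix["name"], ix["column_names"])
# On Postgres this should list ix_recipes_name_normalized_gin and
# ix_recipes_description_normalized_gin alongside the plain b-tree indices;
# on SQLite only the base set is created.
```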
@@ -1,10 +1,11 @@
+import re as re
 from collections.abc import Sequence
 from random import randint
 from uuid import UUID
 
 from pydantic import UUID4
 from slugify import slugify
-from sqlalchemy import Select, and_, desc, func, or_, select
+from sqlalchemy import Select, and_, desc, func, or_, select, text
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import joinedload
 from text_unidecode import unidecode
@@ -151,29 +152,94 @@ class RepositoryRecipes(RepositoryGeneric[Recipe, RecipeModel]):
         return ids + additional_ids
 
     def _add_search_to_query(self, query: Select, search: str) -> Select:
+        """
+        0. fuzzy search (postgres only) and tokenized search are performed separately
+        1. take search string and do a little pre-normalization
+        2. look for internal quoted strings and keep them together as "literal" parts of the search
+        3. remove special characters from each non-literal search string
+        4. token search looks for any individual exact hit in name, description, and ingredients
+        5. fuzzy search looks for trigram hits in name, description, and ingredients
+        6. Sort order is determined by closeness to the recipe name
+        Should search also look at tags?
+        """
+
         normalized_search = unidecode(search).lower().strip()
+        punctuation = "!\#$%&()*+,-./:;<=>?@[\\]^_`{|}~"  # string.punctuation with ' & " removed
+        # keep quoted phrases together as literal portions of the search string
+        literal = False
+        quoted_regex = re.compile(r"""(["'])(?:(?=(\\?))\2.)*?\1""")  # thank you stack exchange!
+        removequotes_regex = re.compile(r"""['"](.*)['"]""")
+        if quoted_regex.search(normalized_search):
+            literal = True
+            temp = normalized_search
+            quoted_search_list = [match.group() for match in quoted_regex.finditer(temp)]  # all quoted strings
+            quoted_search_list = [removequotes_regex.sub("\\1", x) for x in quoted_search_list]  # remove outer quotes
+            temp = quoted_regex.sub("", temp)  # remove all quoted strings, leaving just non-quoted
+            temp = temp.translate(
+                str.maketrans(punctuation, " " * len(punctuation))
+            )  # punctuation->spaces for splitting, but only on unquoted strings
+            unquoted_search_list = temp.split()  # all unquoted strings
+            normalized_search_list = quoted_search_list + unquoted_search_list
+        else:
+            #
+            normalized_search = normalized_search.translate(str.maketrans(punctuation, " " * len(punctuation)))
+            normalized_search_list = normalized_search.split()
+        normalized_search_list = [x.strip() for x in normalized_search_list]  # remove padding whitespace inside quotes
         # I would prefer to just do this in the recipe_ingredient.any part of the main query, but it turns out
         # that at least sqlite wont use indexes for that correctly anymore and takes a big hit, so prefiltering it is
-        ingredient_ids = (
-            self.session.execute(
-                select(RecipeIngredientModel.id).filter(
-                    or_(
-                        RecipeIngredientModel.note_normalized.like(f"%{normalized_search}%"),
-                        RecipeIngredientModel.original_text_normalized.like(f"%{normalized_search}%"),
-                    )
-                )
-            )
-            .scalars()
-            .all()
-        )
+        if (self.session.get_bind().name == "postgresql") & (literal is False):  # fuzzy search
+            ingredient_ids = (
+                self.session.execute(
+                    select(RecipeIngredientModel.id).filter(
+                        or_(
+                            RecipeIngredientModel.note_normalized.op("%>")(normalized_search),
+                            RecipeIngredientModel.original_text_normalized.op("%>")(normalized_search),
+                        )
+                    )
+                )
+                .scalars()
+                .all()
+            )
+        else:  # exact token search
+            ingredient_ids = (
+                self.session.execute(
+                    select(RecipeIngredientModel.id).filter(
+                        or_(
+                            *[RecipeIngredientModel.note_normalized.like(f"%{ns}%") for ns in normalized_search_list],
+                            *[
+                                RecipeIngredientModel.original_text_normalized.like(f"%{ns}%")
+                                for ns in normalized_search_list
+                            ],
+                        )
+                    )
+                )
+                .scalars()
+                .all()
+            )
 
-        q = query.filter(
-            or_(
-                RecipeModel.name_normalized.like(f"%{normalized_search}%"),
-                RecipeModel.description_normalized.like(f"%{normalized_search}%"),
-                RecipeModel.recipe_ingredient.any(RecipeIngredientModel.id.in_(ingredient_ids)),
-            )
-        ).order_by(desc(RecipeModel.name_normalized.like(f"%{normalized_search}%")))
+        if (self.session.get_bind().name == "postgresql") & (literal is False):  # fuzzy search
+            # default = 0.7 is too strict for effective fuzzing
+            self.session.execute(text("set pg_trgm.word_similarity_threshold = 0.5;"))
+            q = query.filter(
+                or_(
+                    RecipeModel.name_normalized.op("%>")(normalized_search),
+                    RecipeModel.description_normalized.op("%>")(normalized_search),
+                    RecipeModel.recipe_ingredient.any(RecipeIngredientModel.id.in_(ingredient_ids)),
+                )
+            ).order_by(  # trigram ordering could be too slow on million record db, but is fine with thousands.
+                func.least(
+                    RecipeModel.name_normalized.op("<->>")(normalized_search),
+                )
+            )
+        else:  # exact token search
+            q = query.filter(
+                or_(
+                    *[RecipeModel.name_normalized.like(f"%{ns}%") for ns in normalized_search_list],
+                    *[RecipeModel.description_normalized.like(f"%{ns}%") for ns in normalized_search_list],
+                    RecipeModel.recipe_ingredient.any(RecipeIngredientModel.id.in_(ingredient_ids)),
+                )
+            ).order_by(desc(RecipeModel.name_normalized.like(f"%{normalized_search}%")))
 
         return q
 
     def page_all(
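The docstring above outlines the search flow; the Postgres-only branch leans on the pg_trgm operators `%>` (word-similarity filter) and `<->>` (word-similarity distance, used for ordering). A hypothetical snippet, not from this commit, showing what those `.op()` calls compile to (column and parameter names are illustrative):

```python
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

name_normalized = sa.column("name_normalized")
fuzzy_filter = name_normalized.op("%>")(sa.bindparam("search"))
fuzzy_order = name_normalized.op("<->>")(sa.bindparam("search"))

print(fuzzy_filter.compile(dialect=postgresql.dialect()))  # name_normalized %> %(search)s
print(fuzzy_order.compile(dialect=postgresql.dialect()))   # name_normalized <->> %(search)s

# With pg_trgm, `col %> q` is true when word_similarity(q, col) clears
# pg_trgm.word_similarity_threshold (lowered to 0.5 in the commit), and
# `col <->> q` is the matching distance (1 - word_similarity(q, col)),
# so ordering by it ascending puts the closest recipe names first.
```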