Files
mealie/mealie/services/scraper/ingredient_nlp/pre_processor.py
2021-08-29 17:10:51 -08:00

98 lines
2.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import re
import unicodedata
# Abbreviation -> expansion table. Keys and values keep their trailing space
# exactly as before, in case other code reads this mapping directly.
replace_abbreviations = {
    "cup ": "cup ",
    "g ": "gram ",
    "kg ": "kilogram ",
    "lb ": "pound ",
    "ml ": "milliliter ",
    "oz ": "ounce ",
    "pint ": "pint ",
    "qt ": "quart ",
    "tbs ": "tablespoon ",
    "tbsp ": "tablespoon ",
    "tsp ": "teaspoon ",
}

# Token view of the table above, with the trailing spaces stripped.
_ABBREVIATION_EXPANSIONS = {
    abbreviation.strip(): expansion.strip() for abbreviation, expansion in replace_abbreviations.items()
}

# An abbreviation is only expanded when it is a token of its own:
#   * ``(?<![^\W\d_])`` - not directly preceded by a letter, so the "g" in
#     "egg" or "kg" is left alone while "100g" (digit before) still matches;
#   * ``(?!\w)`` - not directly followed by a word character, so "cups" or
#     "tbsp" are never cut short by "cup"/"tbs".
_ABBREVIATION_PATTERN = re.compile(
    r"(?<![^\W\d_])(?:"
    + "|".join(re.escape(token) for token in sorted(_ABBREVIATION_EXPANSIONS, key=len, reverse=True))
    + r")(?!\w)"
)


def replace_common_abbreviations(string: str) -> str:
    """Expand unit abbreviations listed in ``replace_abbreviations``.

    Matching is case-sensitive and token-aware: an abbreviation is replaced
    only when it is not attached to a preceding letter and is not followed by
    another word character. Plain substring replacement used to corrupt
    ordinary words ("egg " -> "eggram ") and longer abbreviations
    ("kg " -> "kgram "), and it missed abbreviations at the very end of the
    string because a trailing space was required.

    Args:
        string: The ingredient text to expand.

    Returns:
        The text with every stand-alone abbreviation replaced by its full
        unit name; all other characters are left untouched.
    """
    return _ABBREVIATION_PATTERN.sub(lambda match: _ABBREVIATION_EXPANSIONS[match.group()], string)
def remove_periods(string: str) -> str:
    """Delete each period that has a digit on neither side.

    A period that touches a digit on its left or its right (``2.5``, ``1.``,
    ``.5``) is kept, so decimal quantities survive; every other period is
    removed.
    """
    lone_period = re.compile(r"(?<!\d)\.(?!\d)")
    return lone_period.sub("", string)
def replace_fraction_unicode(string: str) -> str:
    """Rewrite Unicode vulgar-fraction characters as ASCII ``n/d`` text.

    Every character whose Unicode name starts with ``VULGAR FRACTION`` is
    replaced by a space followed by ``numerator/denominator``. The space keeps
    a fraction apart from a whole number written right before it
    ("1\u00bd" -> "1 1/2"). When at least one fraction was replaced, double
    spaces in the result are collapsed to a single space. A string without
    any vulgar fraction is returned unchanged.

    Args:
        string: The ingredient text to convert.

    Returns:
        The text with all vulgar fractions spelled out in ASCII.
    """
    pieces = []
    found_fraction = False

    for char in string:
        try:
            name = unicodedata.name(char)
        except ValueError:
            # Characters without a Unicode name (e.g. control characters)
            # cannot be fractions; keep them as they are.
            pieces.append(char)
            continue

        if not name.startswith("VULGAR FRACTION"):
            pieces.append(char)
            continue

        # NFKC expands a vulgar fraction to "<numerator>\u2044<denominator>",
        # where U+2044 is FRACTION SLASH. The escape is spelled out because an
        # empty separator makes str.partition raise ValueError.
        normalized = unicodedata.normalize("NFKC", char)
        numerator, slash, denominator = normalized.partition("\u2044")
        if not slash:
            # Unexpected decomposition: leave the character untouched rather
            # than emit a malformed "n/" fragment.
            pieces.append(char)
            continue

        pieces.append(f" {numerator}/{denominator}")
        found_fraction = True

    if not found_fraction:
        return string

    # The inserted leading space can double up with an existing one.
    return "".join(pieces).replace("  ", " ")
def wrap_or_clause(string: str) -> str:
    """Wrap the alternative introduced by the first " or " in parentheses.

    The parenthesised clause runs from the first " or " up to the next comma
    (or to the end of the string when there is no comma). Everything after
    that comma is kept verbatim, including any further commas. The result is
    stripped of surrounding whitespace and of one trailing comma.

    Examples:
        '1 teaspoon diamond crystal or 1/2 teaspoon morton kosher salt, plus more'
            -> '1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more'
        'butter or oil' -> 'butter (or oil)'

    Args:
        string: The ingredient text to rewrite.

    Returns:
        The rewritten text, or ``string`` unchanged when it contains no
        " or ".
    """
    # TODO: Needs more adequate testing to be sure this doesn't have side effects.
    head, separator, tail = string.partition(" or ")
    if not separator:
        # Nothing to wrap; indexing the split result here used to raise IndexError.
        return string

    # Splitting only at the first comma keeps later commas intact, and
    # partitioning only at the first " or " keeps any further " or " inside
    # the clause instead of dropping the text after it.
    alternative, comma, remainder = tail.partition(",")
    return f"{head} (or {alternative}){comma}{remainder}".strip().removesuffix(",")
def pre_process_string(string: str) -> str:
    """
    Normalise an ingredient line so the CRF++ model gets the shape it handles best, roughly:

        {qty} {unit} {food}, {additional}
        1 tbs. wine, expensive or other white wine, plus more

    The input is lower-cased, then passed in order through Unicode fraction replacement, period
    removal and abbreviation expansion. If the result contains " or ", the or-clause is wrapped
    in parentheses as a final step.
    """
    pipeline = (replace_fraction_unicode, remove_periods, replace_common_abbreviations)

    processed = string.lower()
    for step in pipeline:
        processed = step(processed)

    return wrap_or_clause(processed) if " or " in processed else processed
def main():
    """Print the pre-processed form of a few sample ingredient lines."""
    # TODO: Migrate to unittests
    samples = (
        "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more",
        "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt",
        "¼ cup michiu tou or other rice wine",
        "1 tbs. wine, expensive or other white wine, plus more",
    )

    print("Starting...")
    for sample in samples:
        print(pre_process_string(sample))
    print("Finished...")


if __name__ == "__main__":
    main()