Normalize recipe instructions, yield, and image URL

Including tests and example recipe data.
This commit is contained in:
Richard Mitic
2021-01-05 19:13:37 +01:00
parent 838a587ca1
commit 96b9f74f84
18 changed files with 1363 additions and 6 deletions

View File

@@ -1,3 +1,5 @@
from typing import List
import json
from pathlib import Path
@@ -12,10 +14,44 @@ CWD = Path(__file__).parent
# Path of data/debug/last_recipe.json, resolved against the parent of CWD.
TEMP_FILE = CWD.parent / "data" / "debug" / "last_recipe.json"
def normalize_data(recipe_data: dict) -> dict:
    """Collapse a list-valued ``recipeYield`` to its first entry.

    NOTE(review): a second ``normalize_data`` is defined further down in this
    file view and would shadow this one at import time. This looks like the
    pre-change side of a diff -- confirm which definition should survive.

    Args:
        recipe_data: Recipe mapping; modified in place.

    Returns:
        The same ``recipe_data`` object that was passed in.
    """
    recipe_yield = recipe_data.get("recipeYield")
    # A missing key or an empty list carries no yield to pick, so the mapping
    # is left untouched instead of raising KeyError / IndexError.
    if isinstance(recipe_yield, list) and recipe_yield:
        recipe_data["recipeYield"] = recipe_yield[0]
    return recipe_data
def normalize_image_url(image) -> str:
    """Reduce a recipe's ``image`` value to a single URL string.

    Accepted shapes:

    * a string, returned unchanged;
    * a dict with a ``"url"`` key, whose value is used;
    * a non-empty list, whose first element is normalized in turn, so both a
      list of strings and a list of ``{"url": ...}`` dicts are supported.

    Args:
        image: Raw ``image`` value in one of the shapes above.

    Returns:
        The image URL.

    Raises:
        ValueError: If ``image`` is ``None``, an empty list, a dict without a
            ``"url"`` key, or any other unsupported value.
    """
    if isinstance(image, str):
        return image
    if isinstance(image, dict) and "url" in image:
        return normalize_image_url(image["url"])
    if isinstance(image, list) and image:
        # Only the first image is kept; it may itself be a dict or a string,
        # so it goes through the same normalization.
        return normalize_image_url(image[0])
    raise ValueError(f"Unrecognised image URL format: {image}")
def normalize_instructions(instructions) -> List[dict]:
    """Convert recipe instructions into a list of ``{"text": ...}`` steps.

    Accepted shapes:

    * a single string: split on ``"\\n"``, one step per non-blank line;
    * a list of strings: one step per string;
    * a list of dicts: one step per dict whose ``"@type"`` is
      ``"HowToStep"``; every other dict is skipped;
    * an empty list: returns an empty list.

    A list is classified by the type of its first element. The text of every
    step is stripped of surrounding whitespace.

    Args:
        instructions: Raw instructions in one of the shapes above.

    Returns:
        A list of single-key dicts, each holding one step under ``"text"``.

    Raises:
        ValueError: If ``instructions`` matches none of the shapes above.
    """
    # One long string split by (possibly multiple) new lines
    if isinstance(instructions, str):
        # Strip before filtering so whitespace-only lines (including the
        # "\r" left over from CRLF input) do not become empty steps.
        stripped_lines = (line.strip() for line in instructions.split("\n"))
        return [{"text": line} for line in stripped_lines if line]

    if isinstance(instructions, list):
        if not instructions:
            return []
        first = instructions[0]
        # Plain strings in a list
        if isinstance(first, str):
            return [{"text": step.strip()} for step in instructions]
        # Dictionaries in a list; only HowToStep entries carry step text.
        # .get() lets entries without an "@type" be skipped, not crash.
        if isinstance(first, dict):
            return [
                {"text": step["text"].strip()}
                for step in instructions
                if step.get("@type") == "HowToStep"
            ]

    raise ValueError(f"Unrecognised instruction format: {instructions}")
def normalize_yield(yld) -> str:
    """Reduce a recipe yield to a single value.

    Args:
        yld: Raw yield; either a list of alternatives or a single value.

    Returns:
        The last element when ``yld`` is a non-empty list, ``None`` when it
        is an empty list, and ``yld`` itself (including ``None``) otherwise.
    """
    if isinstance(yld, list):
        # An empty list has no entry to pick; report it as "no yield"
        # instead of raising IndexError.
        return yld[-1] if yld else None
    return yld
def normalize_data(recipe_data: dict) -> dict:
    """Normalize the yield and instructions of a recipe mapping in place.

    ``recipeYield`` is optional and is read with ``.get``;
    ``recipeInstructions`` must be present, otherwise ``KeyError`` is raised.

    Args:
        recipe_data: Recipe mapping to normalize; modified in place.

    Returns:
        The same ``recipe_data`` object that was passed in.
    """
    raw_yield = recipe_data.get("recipeYield")
    recipe_data["recipeYield"] = normalize_yield(raw_yield)

    raw_steps = recipe_data["recipeInstructions"]
    recipe_data["recipeInstructions"] = normalize_instructions(raw_steps)

    return recipe_data
@@ -52,7 +88,7 @@ def process_recipe_url(url: str) -> dict:
new_recipe.update(mealie_tags)
try:
img_path = scrape_image(new_recipe.get("image"), slug)
img_path = scrape_image(normalize_image_url(new_recipe.get("image")), slug)
new_recipe["image"] = img_path.name
except:
new_recipe["image"] = None