mealie/mealie/services/recipe/recipe_data_service.py
import asyncio
import shutil
from pathlib import Path
from typing import IO

from httpx import AsyncClient, Response
from pydantic import UUID4

from mealie.pkgs import img, safehttp
from mealie.pkgs.safehttp.transport import AsyncSafeTransport
from mealie.schema.recipe.recipe import Recipe
from mealie.services._base_service import BaseService
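# Reuse the User-Agent header that recipe_scrapers ships with so image requests
# present the same browser identity as the scraper itself; fall back to a
# pinned Firefox UA string if the library's internals move.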
try:
    from recipe_scrapers._abstract import HEADERS

    _FIREFOX_UA = HEADERS["User-Agent"]
except (ImportError, KeyError):
    _FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0"


async def gather_with_concurrency(n, *coros, ignore_exceptions=False):
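    """
    Run the given coroutines, allowing at most `n` of them in flight at once.

    With `ignore_exceptions=True`, coroutines that raise are silently dropped
    and only successful results are returned.
    """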
    semaphore = asyncio.Semaphore(n)

    async def sem_coro(coro):
        async with semaphore:
            return await coro

    results = await asyncio.gather(*(sem_coro(c) for c in coros), return_exceptions=ignore_exceptions)
    if ignore_exceptions:
        results = [r for r in results if not isinstance(r, Exception)]
    return results


async def largest_content_len(urls: list[str]) -> tuple[str, int]:
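    """
    HEAD each candidate URL and return the one reporting the largest
    Content-Length, along with that length. Requests that fail or omit
    Content-Length count as length 0, so ("", 0) is returned if nothing
    succeeds.
    """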
largest_url = ""
largest_len = 0
max_concurrency = 10
async def do(client: AsyncClient, url: str) -> Response:
return await client.head(url, headers={"User-Agent": _FIREFOX_UA})
async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client:
tasks = [do(client, url) for url in urls]
responses: list[Response] = await gather_with_concurrency(max_concurrency, *tasks, ignore_exceptions=True)
for response in responses:
len_int = int(response.headers.get("Content-Length", 0))
if len_int > largest_len:
largest_url = str(response.url)
largest_len = len_int
return largest_url, largest_len
class NotAnImageError(Exception):
pass
class InvalidDomainError(Exception):
pass
class RecipeDataService(BaseService):
minifier: img.ABCMinifier
    def __init__(self, recipe_id: UUID4) -> None:
        """
        RecipeDataService is a service that consolidates the reading/writing actions
        related to assets and images for a recipe.
        """
        super().__init__()

        self.recipe_id = recipe_id
        self.minifier = img.PillowMinifier(purge=True, logger=self.logger)

        self.dir_data = Recipe.directory_from_id(self.recipe_id)
        self.dir_image = self.dir_data.joinpath("images")
        self.dir_image_timeline = self.dir_image.joinpath("timeline")
        self.dir_assets = self.dir_data.joinpath("assets")

        for directory in [self.dir_image, self.dir_image_timeline, self.dir_assets]:
            directory.mkdir(parents=True, exist_ok=True)

    def delete_all_data(self) -> None:
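        """Remove the recipe's entire data directory, including images, timeline images, and assets."""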
        try:
            shutil.rmtree(self.dir_data)
        except Exception as e:
            self.logger.exception(f"Failed to delete recipe data: {e}")

    def write_image(self, file_data: bytes | Path | IO[bytes], extension: str, image_dir: Path | None = None) -> Path:
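        """
        Write `file_data` to `<image_dir>/original.<extension>` (defaulting to this
        recipe's image directory), replacing any existing original, then run the
        minifier over the result. `file_data` may be raw bytes, a `Path` to copy
        from, or a readable binary file object.
        """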
        if not image_dir:
            image_dir = self.dir_image

        extension = extension.replace(".", "")
        image_path = image_dir.joinpath(f"original.{extension}")
        image_path.unlink(missing_ok=True)

        if isinstance(file_data, Path):
            shutil.copy2(file_data, image_path)
        elif isinstance(file_data, bytes):
            with open(image_path, "ab") as f:
                f.write(file_data)
        else:
            with open(image_path, "ab") as f:
                shutil.copyfileobj(file_data, f)

        self.minifier.minify(image_path)
        return image_path

    async def scrape_image(self, image_url: str | dict[str, str] | list[str]) -> None:
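        """
        Download the image referenced by `image_url` and store it as the recipe's
        original image. Accepts a single URL, a list of candidate URLs (the one
        with the largest Content-Length wins), or a dict with a "url" key.

        Raises ValueError if no URL can be parsed from the input, and
        NotAnImageError if the response is not an image.
        """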
self.logger.info(f"Image URL: {image_url}")
image_url_str = ""
if isinstance(image_url, str): # Handles String Types
image_url_str = image_url
elif isinstance(image_url, list): # Handles List Types
# Multiple images have been defined in the schema - usually different resolutions
# Typically would be in smallest->biggest order, but can't be certain so test each.
# 'Google will pick the best image to display in Search results based on the aspect ratio and resolution.'
image_url_str, _ = await largest_content_len(image_url)
elif isinstance(image_url, dict): # Handles Dictionary Types
for key in image_url:
if key == "url":
image_url_str = image_url.get("url", "")
if not image_url_str:
raise ValueError(f"image url could not be parsed from input: {image_url}")
ext = image_url_str.split(".")[-1]
if ext not in img.IMAGE_EXTENSIONS:
ext = "jpg" # Guess the extension
file_name = f"{self.recipe_id!s}.{ext}"
file_path = Recipe.directory_from_id(self.recipe_id).joinpath("images", file_name)
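        # Assumption: AsyncSafeTransport (from mealie.pkgs.safehttp) exists to vet
        # outbound requests - the InvalidDomainError defined above suggests it rejects
        # unsafe or disallowed destinations before the fetch is made.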
        async with AsyncClient(transport=AsyncSafeTransport()) as client:
            try:
                r = await client.get(image_url_str, headers={"User-Agent": _FIREFOX_UA})
            except Exception:
                self.logger.exception("Fatal Image Request Exception")
                return None

            if r.status_code != 200:
                # TODO: Probably should throw an exception in this case as well, but before these changes
                # we were returning None if it failed anyways.
                return None

            content_type = r.headers.get("content-type", "")
            if "image" not in content_type:
                self.logger.error(f"Content-Type: {content_type} is not an image")
                raise NotAnImageError(f"Content-Type {content_type} is not an image")

            self.logger.debug(f"File Name Suffix {file_path.suffix}")
            self.write_image(r.read(), file_path.suffix)
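            # write_image saves the download as images/original.<ext>, so this unlink
            # only clears any pre-existing file at images/<recipe_id>.<ext>; it does
            # not remove the image written above.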
            file_path.unlink(missing_ok=True)