#!/usr/bin/env python
"""Scrape a recipe, convert it to Markdown and store it in a Zettelkasten.

A free recipe box.

1. https://obsidian.md/
2. https://www.ourstate.com/a-kitchens-riches/
"""
import argparse
import json
import os
import sys

try:
    import httpx

    # Prefer httpx when it is installed, so images can be fetched over HTTP/2.
    url_getter = httpx.Client(http2=True)
except ImportError:
    import requests

    # Fall back to requests; it exposes the same .get() interface used below.
    url_getter = requests

from recipe_scrapers import scrape_me, WebsiteNotImplementedError, SCRAPERS

ROOT = '~/.config/recipe_box/'


def ensure_directory_exists(path, expand_user=True, file=False):
    """Create a directory if it doesn't exist.

    Expands '~' to the user's home directory on POSIX systems.
    """
    if expand_user:
        path = os.path.expanduser(path)
    if file:
        directory = os.path.dirname(path)
    else:
        directory = path
    if directory and not os.path.exists(directory):
        try:
            os.makedirs(directory)
        except OSError:
            # A parallel process created the directory after the existence check.
            pass
    return path
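
# For example (assuming a POSIX home directory):
#   ensure_directory_exists('~/recipe_box/media') -> '/home/<user>/recipe_box/media',
# creating the directory if it is missing and returning the expanded path.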


def valid_filename(directory, filename=None, ascii=False):
    """Return a valid "new" filename in a directory, given a directory and
    filename (or a single full path) to test.

    Deals with duplicate filenames.
    """
    def test_filename(filename, count):
        """Candidate filename to test for existence."""
        fn, ext = os.path.splitext(filename)
        return fn + '({})'.format(count) + ext

    return_path = filename is None
    # Directory is a full path; split it into directory and filename.
    if filename is None:
        filename = os.path.basename(directory)
        directory = os.path.dirname(directory)
    # if ascii:
    #     filename = unidecode(unicode(filename))
    #     filename = ' '.join(filename.splitlines()).strip()
    #     filename = filename.decode('ascii', 'ignore')
    # Allow for directories.
    items = set(os.listdir(directory))
    if filename in items:
        count = 1
        while test_filename(filename, count) in items:
            count += 1
        if return_path:
            return os.path.join(directory, test_filename(filename, count))
        return test_filename(filename, count)
    else:
        if return_path:
            return os.path.join(directory, filename)
        return filename
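
# For example, if 'carrot cake.md' already exists in the recipe box,
# valid_filename returns a path ending in 'carrot cake(1).md', then
# 'carrot cake(2).md', and so on (the recipe name is illustrative).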


def process_recipe(config, scraper, url, verbose=False):
    """Process the recipe at a given URL."""
    recipe_box = ensure_directory_exists(config['recipe_box'])
    media = ensure_directory_exists(os.path.join(config['recipe_box'], 'media'))
    prefix = scraper.title().lower()
    path = os.path.join(recipe_box, prefix + '.md')
    path = valid_filename(path)
    recipe = open(path, 'w')
    try:
        image_url = scraper.image()
        response = url_getter.get(image_url)
    except Exception:
        # Not every scraper provides an image; skip it rather than abort.
        filename = None
    else:
        # Not sure about image urls without filename extensions, might need python-magic.
        # Also, os.path.splitext(url), probably not a good idea. ;)
        filename = os.path.splitext(os.path.basename(path))[0] + os.path.splitext(scraper.image())[1]
        filepath = os.path.join(media, filename)
        image = open(filepath, 'wb')
        image.write(response.content)
        image.close()
        if verbose:
            print('Saving {url} -> {path}'.format(url=image_url, path=filepath))
    recipe.write('# {title}\n'.format(title=scraper.title()))
    if filename:
        recipe.write('![[{filename}]]\n'.format(filename=filename))
    recipe.write('\n')
    # This is a placeholder for the user's own notes about the recipe.
    recipe.write('## Notes\n')
    recipe.write('\n')
    recipe.write('## Metadata\n')
    recipe.write('Yields: {yields}\n'.format(yields=scraper.yields()))
    recipe.write('Total Time: {total_time}\n'.format(total_time=scraper.total_time()))
    recipe.write('\n')
    recipe.write('## Ingredients\n')
    for ingredient in scraper.ingredients():
        recipe.write('* {ingredient}\n'.format(ingredient=ingredient))
    recipe.write('\n')
    recipe.write('## Instructions\n')
    for instruction in scraper.instructions().split('\n'):
        instruction = instruction.strip()
        if instruction:
            # Keep the site's own numbering if a step already starts with a digit.
            if instruction[0].isdigit():
                recipe.write('{instruction}\n'.format(instruction=instruction))
            else:
                recipe.write('1. {instruction}\n'.format(instruction=instruction))
    recipe.write('\n')
    recipe.write('[{url}]({url})\n'.format(url=url))
    recipe.close()
    print('Saving {url} -> {path}'.format(url=url, path=path))
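
# The generated note looks roughly like this (title, image name, and metadata
# values are illustrative):
#
#   # Carrot Cake
#   ![[carrot cake.jpg]]
#
#   ## Notes
#
#   ## Metadata
#   Yields: 12 servings
#   Total Time: 90
#
#   ## Ingredients
#   * ...
#
#   ## Instructions
#   1. ...
#
#   [https://...](https://...)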


def main():
    """Console script entry point."""
    parser = argparse.ArgumentParser()
    parser.add_argument('url', metavar='URL', type=str, nargs='*', default=[], help='recipe url')
    parser.add_argument('-l', dest='list', action='store_true', default=False, help='list all available sites')
    parser.add_argument('-w', dest='wild_mode', action='store_true', default=False, help="try scraping an 'unknown' site using wild-mode (some editing of the recipe might be required)")
    parser.add_argument('-v', dest='verbose', action='store_true', default=False, help='verbose output')
    args = parser.parse_args()
    if args.list:
        # Print every site recipe_scrapers supports, then quit.
        for host in sorted(SCRAPERS):
            print(host)
        sys.exit()

    wild_mode = args.wild_mode
    verbose = args.verbose

    config_path = ensure_directory_exists(os.path.join(ROOT, 'recipe_box.json'), file=True)
    if not os.path.exists(config_path):
        config = {'recipe_box': '~/recipe_box/'}
        with open(config_path, 'w') as f:
            json.dump(config, f, indent=4)
    else:
        with open(config_path, 'r') as f:
            config = json.load(f)
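
    # The config is a single JSON object naming the note directory, e.g.:
    #   {"recipe_box": "~/recipe_box/"}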

    for url in args.url:
        if url:
            try:
                scraper = scrape_me(url, wild_mode=wild_mode)
            except WebsiteNotImplementedError:
                print('No scraper defined for {url}'.format(url=url))
                print('Try using the -w [wild-mode] option; your mileage may vary.')
                print('')
                print('It is recommended you ask for the site to be added to recipe-scrapers, that way everybody gains from the effort.')
                print('https://github.com/hhursev/recipe-scrapers#if-you-want-a-scraper-for-a-new-site-added')
                print('')
                print('Once someone has added the new scraper:')
                print('    pip install --upgrade recipe-scrapers')
            else:
                process_recipe(config, scraper, url, verbose)


if __name__ == '__main__':
    main()