recipe-box-for-wikijs/recipe_box.py

#!/usr/bin/env python

""" Scrape a recipe, convert it to Markdown and store it in a Zettelkasten.

    A free recipe-box.

    1. https://obsidian.md/
    2. https://www.ourstate.com/a-kitchens-riches/
"""
import argparse
import json
import os
import sys

# Prefer httpx (with an HTTP/2-enabled client) when it is installed; otherwise
# fall back to the requests module. Either way url_getter is only used through
# its .get() method.
try:
    import httpx
    url_getter = httpx.Client(http2=True)
except ImportError:
    import requests
    url_getter = requests

from recipe_scrapers import scrape_me, WebsiteNotImplementedError, SCRAPERS


# Directory that holds the recipe_box.json configuration file. The leading '~'
# is expanded by ensure_directory_exists() before the path is used.
ROOT = '~/.config/recipe_box/'


def ensure_directory_exists(path, expand_user=True, file=False):
    """ Create a directory if it doesn't exist.

        path        -- directory to create, or a file path when `file` is True.
        expand_user -- expand a leading '~' to the user's home directory.
        file        -- treat `path` as a file and create its parent directory
                       instead of `path` itself.

        Returns the (possibly expanded) path, not the directory that was created.

        Raises OSError when the directory is missing and cannot be created.
    """
    if expand_user:
        path = os.path.expanduser(path)

    directory = os.path.dirname(path) if file else path

    # An empty directory part (a bare filename) means the current directory,
    # which needs no creation.
    if directory and not os.path.exists(directory):
        # exist_ok tolerates a parallel process creating the directory after
        # the existence check, while genuine failures (permissions, a file in
        # the way) surface here instead of being silently swallowed.
        os.makedirs(directory, exist_ok=True)

    return path


def valid_filename(directory, filename=None, ascii=False):
    """ Return a "new" filename that does not yet exist in a directory.

        Two calling conventions are supported:

        valid_filename(directory, filename) -- returns a bare filename.
        valid_filename(path)                -- `path` is split into directory
                                               and filename; a full path is
                                               returned.

        When the name is already taken a counter is inserted before the
        extension, e.g. 'soup.md' -> 'soup(1).md' -> 'soup(2).md'.

        `ascii` is currently unused; it is kept so existing callers keep working.

        Raises OSError when the directory cannot be listed.
    """
    def test_filename(filename, count):
        """ Filename to test for existence.
        """
        fn, ext = os.path.splitext(filename)
        return fn + '({})'.format(count) + ext

    return_path = filename is None

    # Directory is a path.
    if return_path:
        directory, filename = os.path.split(directory)

    # A bare filename has an empty directory part; look in the current
    # directory instead of letting os.listdir('') fail.
    items = set(os.listdir(directory or os.curdir))

    candidate = filename
    count = 0
    while candidate in items:
        count += 1
        candidate = test_filename(filename, count)

    if return_path:
        return os.path.join(directory, candidate)
    return candidate


def process_recipe(config, scraper, url, verbose=False):
    """ Process the recipe at a given URL.

        config  -- mapping with a 'recipe_box' entry naming the output directory.
        scraper -- scraper object for the recipe; its title(), image(),
                   yields(), total_time(), ingredients() and instructions()
                   methods are used.
        url     -- the recipe URL, written at the end of the Markdown file.
        verbose -- also report where the recipe image was saved.

        Writes '<title>.md' into the recipe box (never overwriting an existing
        file) and, when the image can be downloaded, the image into the
        'media' sub-directory.
    """
    recipe_box = ensure_directory_exists(config['recipe_box'])
    media = ensure_directory_exists(os.path.join(config['recipe_box'], 'media'))

    prefix = scraper.title().lower()
    # A path separator in the title would point into a sub-directory that does
    # not exist, so keep the title a single path component.
    for separator in filter(None, (os.sep, os.altsep)):
        prefix = prefix.replace(separator, '-')
    path = os.path.join(recipe_box, prefix + '.md')
    path = valid_filename(path)

    # The image is optional: any failure to locate or download it simply
    # results in a recipe without a picture.
    filename = None
    try:
        image_url = scraper.image()
        response = url_getter.get(image_url)
        # Do not store an HTTP error page as if it were the image.
        response.raise_for_status()
    except Exception:
        pass
    else:
        # Not sure about image urls without filename extensions, might need python-magic.
        # Also, os.path.splitext(url), probably not a good idea. ;)
        filename = os.path.splitext(os.path.basename(path))[0] + os.path.splitext(image_url)[1]
        filepath = os.path.join(media, filename)
        with open(filepath, 'wb') as image:
            image.write(response.content)
        if verbose:
            print('Saving {url} -> {path}'.format(url=image_url, path=filepath))

    # Markdown is written as UTF-8 regardless of the platform's locale so
    # that non-ASCII recipe text cannot fail to encode.
    with open(path, 'w', encoding='utf-8') as recipe:
        if filename:
            recipe.write('![{filename}]("/images/recipe/{filename}")\n'.format(filename=filename))
        recipe.write('\n')
        recipe.write('## Information\n')
        recipe.write('Yields: {yields}\n'.format(yields=scraper.yields()))
        recipe.write('Total Time: {total_time}\n'.format(total_time=scraper.total_time()))
        recipe.write('\n')
        recipe.write('## Ingredients\n')
        for ingredient in scraper.ingredients():
            recipe.write('1. {ingredient}\n'.format(ingredient=ingredient))

        recipe.write('\n')
        recipe.write('## Instructions\n')
        for instruction in scraper.instructions().split('\n'):
            instruction = instruction.strip()
            if not instruction:
                continue
            if instruction[0].isdigit():
                # Already numbered by the source site; keep its numbering.
                recipe.write('{instruction}\n'.format(instruction=instruction))
            else:
                recipe.write('1. {instruction}\n'.format(instruction=instruction))

        recipe.write('\n#### URL\n')
        recipe.write('[{url}]({url})\n'.format(url=url))

    # Reported regardless of the verbose flag.
    print('Saving {url} -> {path}'.format(url=url, path=path))


def main():
    """ Console script entry point.

        Parses the command line, lists the supported sites when asked to,
        loads (or creates) the JSON configuration and processes every URL
        given on the command line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('url', metavar='URL', type=str, nargs='*', default='', help='recipe url')
    parser.add_argument('-l', dest='list', action='store_true', default=False, help='list all available sites')
    parser.add_argument('-w', dest='wild_mode', action='store_true', default=False, help="try scraping 'unknown' site using wild-mode (some editing of the recipe might be required)")
    parser.add_argument('-v', dest='verbose', action='store_true', default=False, help='verbose output')
    options = parser.parse_args()

    # Listing the supported hosts is a stand-alone action.
    if options.list:
        for host in sorted(SCRAPERS):
            print(host)
        sys.exit()

    # Read the configuration, writing the defaults on first use.
    config_path = ensure_directory_exists(os.path.join(ROOT, 'recipe_box.json'), file=True)
    if os.path.exists(config_path):
        with open(config_path, 'r') as handle:
            config = json.load(handle)
    else:
        config = {'recipe_box': '~/recipe_box/'}
        with open(config_path, 'w') as handle:
            json.dump(config, handle, indent=4)

    unsupported_help = (
        'Try using the -w [wild-mode] option, your mileage may vary.',
        '',
        'It is recommended you add it to recipe-scrapers site, that way everybody gains from the effort.',
        'https://github.com/hhursev/recipe-scrapers#if-you-want-a-scraper-for-a-new-site-added',
        '',
        'Once someone has added the new scraper:',
        'pip install --upgrade recipe-scrapers',
    )

    for url in options.url:
        if not url:
            continue

        try:
            scraper = scrape_me(url, wild_mode=options.wild_mode)
        except WebsiteNotImplementedError:
            print('No scraper defined for {url}'.format(url=url))
            for line in unsupported_help:
                print(line)
            continue

        process_recipe(config, scraper, url, options.verbose)


# Run the console entry point only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Implement recipe_box. 2020-06-26 19:36:31 -05:00			`#!/usr/bin/env python`

			`""" Scrape a recipe, convert it to Markdown and store it in a Zettelkasten.`

			`A free recipe-box.`

			`1. https://obsidian.md/`
			`2. https://www.ourstate.com/a-kitchens-riches/`
			`"""`
Merge Sayantan Santra (SinTan1729)'s improvements. 2023-07-01 06:36:55 -05:00			`import argparse`
Implement recipe_box. 2020-06-26 19:36:31 -05:00			`import json`
			`import os`
			`import sys`

Merge Sayantan Santra (SinTan1729)'s improvements. 2023-07-01 06:36:55 -05:00			`try:`
			`import httpx`
Fix httpx client instantiation. 2023-07-01 06:45:15 -05:00			`url_getter = httpx.Client(http2=True)`
Merge Sayantan Santra (SinTan1729)'s improvements. 2023-07-01 06:36:55 -05:00			`except ImportError:`
			`import requests`
			`url_getter = requests`

Add options to display supported sites, improve docs. 2020-06-27 07:12:03 -05:00			`from recipe_scrapers import scrape_me, WebsiteNotImplementedError, SCRAPERS`
Implement recipe_box. 2020-06-26 19:36:31 -05:00

Sane defaults. 2020-06-27 14:05:33 -05:00			`ROOT = '~/.config/recipe_box/'`
Implement recipe_box. 2020-06-26 19:36:31 -05:00

			`def ensure_directory_exists(path, expand_user=True, file=False):`
			`""" Create a directory if it doesn't exists.`

			`Expanding '~' to the user's home directory on POSIX systems.`
			`"""`
			`if expand_user:`
			`path = os.path.expanduser(path)`

			`if file:`
			`directory = os.path.dirname(path)`
			`else:`
			`directory = path`

			`if not os.path.exists(directory) and directory:`
			`try:`
			`os.makedirs(directory)`
			`except OSError as e:`
			`# A parallel process created the directory after the existence check.`
			`pass`

Merge Sayantan Santra (SinTan1729)'s improvements. 2023-07-01 06:36:55 -05:00			`return path`
Implement recipe_box. 2020-06-26 19:36:31 -05:00

			`def valid_filename(directory, filename=None, ascii=False):`
			`""" Return a valid "new" filename in a directory, given a filename/directory=path to test.`

			`Deal with duplicate filenames.`
			`"""`
			`def test_filename(filename, count):`
			`""" Filename to test for existence.`
			`"""`
			`fn, ext = os.path.splitext(filename)`
			`return fn + '({})'.format(count) + ext`

			`return_path = filename is None`

			`# Directory is a path.`
			`if filename is None:`
			`filename = os.path.basename(directory)`
			`directory = os.path.dirname(directory)`

			`# if ascii:`
			`# filename = unidecode(unicode(filename))`
			`# filename = ' '.join(filename.splitlines()).strip()`
			`# filename = filename.decode('ascii', 'ignore')`

			`# Allow for directories.`
Merge Sayantan Santra (SinTan1729)'s improvements. 2023-07-01 06:36:55 -05:00			`items = set(os.listdir(directory))`
Implement recipe_box. 2020-06-26 19:36:31 -05:00			`if filename in items:`
			`count = 1`
			`while test_filename(filename, count) in items:`
			`count += 1`
			`if return_path:`
			`return os.path.join(directory, test_filename(filename, count))`
			`return test_filename(filename, count)`
			`else:`
			`if return_path:`
			`return os.path.join(directory, filename)`
			`return filename`


Merge Sayantan Santra (SinTan1729)'s improvements. 2023-07-01 06:36:55 -05:00			`def process_recipe(config, scraper, url, verbose=False):`
			`""" Process the recipe at a given URL.`
Sane defaults. 2020-06-27 14:05:33 -05:00			`"""`
Merge Sayantan Santra (SinTan1729)'s improvements. 2023-07-01 06:36:55 -05:00			`recipe_box = ensure_directory_exists(config['recipe_box'])`
			`media = ensure_directory_exists(os.path.join(config['recipe_box'], 'media'))`

			`prefix = scraper.title().lower()`
			`path = os.path.join(recipe_box, prefix + '.md')`
			`path = valid_filename(path)`
			`recipe = open(path, 'w')`

			`try:`
			`image_url = scraper.image()`
			`response = url_getter.get(image_url)`
			`except:`
			`filename = None`
			`else:`
			`# Not sure about image urls without filename extensions, might need python-magic.`
			`# Also, os.path.splitext(url), probably not a good idea. ;)`
			`filename = os.path.splitext(os.path.basename(path))[0] + os.path.splitext(scraper.image())[1]`
			`filepath = os.path.join(media, filename)`
			`image = open(filepath, 'wb')`
			`image.write(response.content)`
			`image.close()`
			`if verbose:`
			`print('Saving {url} -> {path}'.format(url=image_url, path=filepath))`

			`if filename:`
change: Change in formatting 2023-07-05 18:24:14 -05:00			`recipe.write('![{filename}]("/images/recipe/{filename}")\n'.format(filename=filename))`
Merge Sayantan Santra (SinTan1729)'s improvements. 2023-07-01 06:36:55 -05:00			`recipe.write('\n')`
change: Change in formatting 2023-07-05 18:24:14 -05:00			`recipe.write('## Information\n')`
Merge Sayantan Santra (SinTan1729)'s improvements. 2023-07-01 06:36:55 -05:00			`recipe.write('Yields: {yields}\n'.format(yields=scraper.yields()))`
			`recipe.write('Total Time: {total_time}\n'.format(total_time=scraper.total_time()))`
			`recipe.write('\n')`
			`recipe.write('## Ingredients\n')`
			`for ingredient in scraper.ingredients():`
change: Change in formatting 2023-07-05 18:24:14 -05:00			`recipe.write('1. {ingredient}\n'.format(ingredient=ingredient))`
Merge Sayantan Santra (SinTan1729)'s improvements. 2023-07-01 06:36:55 -05:00
			`recipe.write('\n')`
			`recipe.write('## Instructions\n')`
			`for instruction in scraper.instructions().split('\n'):`
			`instruction = instruction.strip()`
			`if instruction:`
			`if instruction[0].isdigit():`
			`recipe.write('{instruction}\n'.format(instruction=instruction))`
			`else:`
			`recipe.write('1. {instruction}\n'.format(instruction=instruction))`
Add options to display supported sites, improve docs. 2020-06-27 07:12:03 -05:00
change: Change in formatting 2023-07-05 18:24:14 -05:00			`recipe.write('\n#### URL\n')`
Merge Sayantan Santra (SinTan1729)'s improvements. 2023-07-01 06:36:55 -05:00			`recipe.write('[{url}]({url})\n'.format(url=url))`
			`recipe.close()`
			`# if verbose:`
			`print('Saving {url} -> {path}'.format(url=url, path=path))`
Add options to display supported sites, improve docs. 2020-06-27 07:12:03 -05:00
Implement recipe_box. 2020-06-26 19:36:31 -05:00
Merge Sayantan Santra (SinTan1729)'s improvements. 2023-07-01 06:36:55 -05:00			`def main():`
			`""" Console script entry point.`
			`"""`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument('url', metavar='URL', type=str, nargs='*', default='', help='recipe url')`
			`parser.add_argument('-l', dest='list', action='store_true', default=False, help='list all available sites')`
			`parser.add_argument('-w', dest='wild_mode', action='store_true', default=False, help="try scraping 'unknown' site using wild-mode (some editing of the recipe might be required)")`
			`parser.add_argument('-v', dest='verbose', action='store_true', default=False, help='verbose output')`
			`args = parser.parse_args()`

			`if args.list:`
Add options to display supported sites, improve docs. 2020-06-27 07:12:03 -05:00			`for host in sorted(SCRAPERS):`
			`print(host)`
			`sys.exit()`

Merge Sayantan Santra (SinTan1729)'s improvements. 2023-07-01 06:36:55 -05:00			`wild_mode = args.wild_mode`
			`verbose = args.verbose`

Sane defaults. 2020-06-27 14:05:33 -05:00			`config_path = ensure_directory_exists(os.path.join(ROOT, 'recipe_box.json'), file=True)`
Implement recipe_box. 2020-06-26 19:36:31 -05:00			`if not os.path.exists(config_path):`
			`config = {'recipe_box': '~/recipe_box/'}`
			`with open(config_path, 'w') as f:`
Sane defaults. 2020-06-27 14:05:33 -05:00			`json.dump(config, f, indent=4)`
Implement recipe_box. 2020-06-26 19:36:31 -05:00			`else:`
			`with open(config_path, 'r') as f:`
			`config = json.load(f)`

Merge Sayantan Santra (SinTan1729)'s improvements. 2023-07-01 06:36:55 -05:00			`for url in args.url:`
			`if url:`
Implement recipe_box. 2020-06-26 19:36:31 -05:00			`try:`
Merge Sayantan Santra (SinTan1729)'s improvements. 2023-07-01 06:36:55 -05:00			`scraper = scrape_me(url, wild_mode=wild_mode)`
			`except WebsiteNotImplementedError:`
			`print('No scraper defined for {url}'.format(url=url))`
			`print('Try using the -w [wild-mode] option, your mileage may vary.')`
			`print('')`
			`print('It is recommended you add it to recipe-scrapers site, that way everybody gains from the effort.')`
			`print('https://github.com/hhursev/recipe-scrapers#if-you-want-a-scraper-for-a-new-site-added')`
			`print('')`
			`print('Once someone has added the new scraper:')`
			`print('pip install --upgrade recipe-scrapers')`
Implement recipe_box. 2020-06-26 19:36:31 -05:00			`else:`
Merge Sayantan Santra (SinTan1729)'s improvements. 2023-07-01 06:36:55 -05:00			`process_recipe(config, scraper, url, verbose)`
Sane defaults. 2020-06-27 14:05:33 -05:00

			`if __name__ == '__main__':`
			`main()`
Merge Sayantan Santra (SinTan1729)'s improvements. 2023-07-01 06:36:55 -05:00