Mirror of https://github.com/SinTan1729/recipe-box-for-wikijs.git (synced 2024-12-26 10:08:37 -06:00)
Merge Sayantan Santra (SinTan1729)'s improvements.
This commit is contained in:
parent 5b218043d0
commit 87360450ae

1 changed file with 93 additions and 69 deletions

recipe_box.py (162 changed lines)
@@ -7,12 +7,18 @@
 1. https://obsidian.md/
 2. https://www.ourstate.com/a-kitchens-riches/
 """
+import argparse
 import json
-import optparse
 import os
-import requests
 import sys

+try:
+    import httpx
+    url_getter = httpx
+except ImportError:
+    import requests
+    url_getter = requests
+
 from recipe_scrapers import scrape_me, WebsiteNotImplementedError, SCRAPERS
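The new import block prefers httpx and falls back to requests only when httpx is not installed. Both libraries expose a module-level get() whose response carries the raw bytes in .content, which is the only part of the interface the rest of the script touches, so url_getter can point at either one. A minimal sketch of the same fallback pattern (the URL is a hypothetical placeholder, not taken from the diff):

try:
    import httpx
    url_getter = httpx
except ImportError:
    import requests
    url_getter = requests

# Works the same with either backend: both return a response whose
# .content attribute holds the fetched bytes.
response = url_getter.get('https://example.com/image.jpg')
image_bytes = response.content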
@@ -39,7 +45,7 @@ def ensure_directory_exists(path, expand_user=True, file=False):
             # A parallel process created the directory after the existence check.
             pass

-    return(path)
+    return path


 def valid_filename(directory, filename=None, ascii=False):
@ -66,7 +72,7 @@ def valid_filename(directory, filename=None, ascii=False):
|
||||||
# filename = filename.decode('ascii', 'ignore')
|
# filename = filename.decode('ascii', 'ignore')
|
||||||
|
|
||||||
# Allow for directories.
|
# Allow for directories.
|
||||||
items = {item: True for item in os.listdir(directory)}
|
items = set(os.listdir(directory))
|
||||||
if filename in items:
|
if filename in items:
|
||||||
count = 1
|
count = 1
|
||||||
while test_filename(filename, count) in items:
|
while test_filename(filename, count) in items:
|
||||||
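In the hunk above, a set gives exactly the membership test the old dict-of-True provided, just more idiomatically. A tiny sketch of the assumed usage (file names are made up):

import os

existing = set(os.listdir('.'))        # e.g. {'media', 'pancakes.md'}
if 'pancakes.md' in existing:          # same result the old {name: True} dict gave
    print('name taken, append a counter')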
@@ -80,24 +86,83 @@ def valid_filename(directory, filename=None, ascii=False):
     return filename


+def process_recipe(config, scraper, url, verbose=False):
+    """ Process the recipe at a given URL.
+    """
+    recipe_box = ensure_directory_exists(config['recipe_box'])
+    media = ensure_directory_exists(os.path.join(config['recipe_box'], 'media'))
+
+    prefix = scraper.title().lower()
+    path = os.path.join(recipe_box, prefix + '.md')
+    path = valid_filename(path)
+    recipe = open(path, 'w')
+
+    try:
+        image_url = scraper.image()
+        response = url_getter.get(image_url)
+    except:
+        filename = None
+    else:
+        # Not sure about image urls without filename extensions, might need python-magic.
+        # Also, os.path.splitext(url), probably not a good idea. ;)
+        filename = os.path.splitext(os.path.basename(path))[0] + os.path.splitext(scraper.image())[1]
+        filepath = os.path.join(media, filename)
+        image = open(filepath, 'wb')
+        image.write(response.content)
+        image.close()
+        if verbose:
+            print('Saving {url} -> {path}'.format(url=image_url, path=filepath))
+
+    recipe.write('# {title}\n'.format(title=scraper.title()))
+    if filename:
+        recipe.write('![[{filename}]]\n'.format(filename=filename))
+    recipe.write('\n')
+    # This is a placeholder for the user's own notes about the recipe.
+    recipe.write('## Notes\n')
+    recipe.write('\n')
+    recipe.write('## Metadata\n')
+    recipe.write('Yields: {yields}\n'.format(yields=scraper.yields()))
+    recipe.write('Total Time: {total_time}\n'.format(total_time=scraper.total_time()))
+    recipe.write('\n')
+    recipe.write('## Ingredients\n')
+    for ingredient in scraper.ingredients():
+        recipe.write('* {ingredient}\n'.format(ingredient=ingredient))
+
+    recipe.write('\n')
+    recipe.write('## Instructions\n')
+    for instruction in scraper.instructions().split('\n'):
+        instruction = instruction.strip()
+        if instruction:
+            if instruction[0].isdigit():
+                recipe.write('{instruction}\n'.format(instruction=instruction))
+            else:
+                recipe.write('1. {instruction}\n'.format(instruction=instruction))
+
+    recipe.write('\n')
+    recipe.write('[{url}]({url})\n'.format(url=url))
+    recipe.close()
+    # if verbose:
+    print('Saving {url} -> {path}'.format(url=url, path=path))
+
+
 def main():
     """ Console script entry point.
     """
-    parser = optparse.OptionParser('%prog url')
-    parser.add_option('-l',
-                      dest='list',
-                      action='store_true',
-                      default=False,
-                      help='list all available sites')
-
-    options, args = parser.parse_args()
-
-    if options.list:
+    parser = argparse.ArgumentParser()
+    parser.add_argument('url', metavar='URL', type=str, nargs='*', default='', help='recipe url')
+    parser.add_argument('-l', dest='list', action='store_true', default=False, help='list all available sites')
+    parser.add_argument('-w', dest='wild_mode', action='store_true', default=False, help="try scraping 'unknown' site using wild-mode (some editing of the recipe might be required)")
+    parser.add_argument('-v', dest='verbose', action='store_true', default=False, help='verbose output')
+    args = parser.parse_args()
+
+    if args.list:
         for host in sorted(SCRAPERS):
             print(host)
         sys.exit()

+    wild_mode = args.wild_mode
+    verbose = args.verbose

     config_path = ensure_directory_exists(os.path.join(ROOT, 'recipe_box.json'), file=True)
     if not os.path.exists(config_path):
         config = {'recipe_box': '~/recipe_box/'}
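process_recipe now holds the note-writing logic that used to live inline in main(): it downloads the recipe image into the media/ folder and writes a wiki-style Markdown note with Notes, Metadata, Ingredients and Instructions sections plus a link back to the source. A hedged sketch of calling it directly, assuming recipe_box.py is importable and using placeholder values for the config and URL:

from recipe_scrapers import scrape_me
from recipe_box import process_recipe   # assumes recipe_box.py is on the import path

config = {'recipe_box': '~/recipe_box/'}             # the same default main() writes to recipe_box.json
url = 'https://example.com/some-recipe'              # placeholder; any site recipe-scrapers supports
scraper = scrape_me(url)                             # raises WebsiteNotImplementedError for unsupported sites
process_recipe(config, scraper, url, verbose=True)   # verbose=True also logs the image download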
@@ -107,64 +172,23 @@ def main():
     with open(config_path, 'r') as f:
         config = json.load(f)

-    for url in args:
-        try:
-            scraper = scrape_me(url)
-        except WebsiteNotImplementedError:
-            print('No scraper defined for {url}'.format(url=url))
-            print('It is recommended you add it to recipe-scrapers site, that way everybody gains from the effort.')
-            print('https://github.com/hhursev/recipe-scrapers#if-you-want-a-scraper-for-a-new-site-added')
-            print('')
-            print('Once someone has added the new scraper:')
-            print('pip install --upgrade recipe-scrapers')
-        else:
-            recipe_box = ensure_directory_exists(config['recipe_box'])
-            media = ensure_directory_exists(os.path.join(config['recipe_box'], 'media'))
-
-            prefix = scraper.title().lower()
-            path = os.path.join(recipe_box, prefix + '.md')
-            path = valid_filename(path)
-            recipe = open(path, 'w')
-
+    for url in args.url:
+        if url:
             try:
-                response = requests.get(scraper.image())
-            except:
-                filename = None
+                scraper = scrape_me(url, wild_mode=wild_mode)
+            except WebsiteNotImplementedError:
+                print('No scraper defined for {url}'.format(url=url))
+                print('Try using the -w [wild-mode] option, your mileage may vary.')
+                print('')
+                print('It is recommended you add it to recipe-scrapers site, that way everybody gains from the effort.')
+                print('https://github.com/hhursev/recipe-scrapers#if-you-want-a-scraper-for-a-new-site-added')
+                print('')
+                print('Once someone has added the new scraper:')
+                print('pip install --upgrade recipe-scrapers')
             else:
-                # Not sure about image urls without filename extensions, might need python-magic.
-                # Also, os.path.splitext(url), probably not a good idea. ;)
-                filename = os.path.splitext(os.path.basename(path))[0] + os.path.splitext(scraper.image())[1]
-                image = open(os.path.join(media, filename), 'wb')
-                image.write(response.content)
-                image.close()
-
-            recipe.write('# {title}\n'.format(title=scraper.title()))
-            if filename:
-                recipe.write('![[{filename}]]\n'.format(filename=filename))
-            recipe.write('\n')
-            recipe.write('## Notes\n')
-            recipe.write('\n')
-            recipe.write('## Metadata\n')
-            recipe.write('Yields: {yields}\n'.format(yields=scraper.yields()))
-            recipe.write('Total Time: {total_time}\n'.format(total_time=scraper.total_time()))
-            recipe.write('\n')
-            recipe.write('## Ingredients\n')
-            for ingredient in scraper.ingredients():
-                recipe.write('* {ingredient}\n'.format(ingredient=ingredient))
-
-            recipe.write('\n')
-            recipe.write('## Instructions\n')
-            for instruction in scraper.instructions().split('\n'):
-                instruction = instruction.strip()
-                if instruction:
-                    if instruction[0].isdigit():
-                        recipe.write('{instruction}\n'.format(instruction=instruction))
-                    else:
-                        recipe.write('1. {instruction}\n'.format(instruction=instruction))
-
-            recipe.write('\n')
-            recipe.write('[{url}]({url})\n'.format(url=url))
+                process_recipe(config, scraper, url, verbose)


 if __name__ == '__main__':
     main()
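Taken together, main() now builds its CLI with argparse: bare URLs are positional, -l lists supported sites, -w turns on recipe-scrapers' wild mode for unknown sites, and -v enables verbose output; each URL is then scraped and handed to process_recipe. A small self-contained sketch of the flag handling, rebuilt here for illustration with a placeholder URL (help strings omitted):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('url', metavar='URL', type=str, nargs='*', default='')
parser.add_argument('-l', dest='list', action='store_true', default=False)
parser.add_argument('-w', dest='wild_mode', action='store_true', default=False)
parser.add_argument('-v', dest='verbose', action='store_true', default=False)

# Equivalent of running: recipe_box.py -w -v https://example.com/some-recipe
args = parser.parse_args(['-w', '-v', 'https://example.com/some-recipe'])
print(args.wild_mode, args.verbose, args.url)
# True True ['https://example.com/some-recipe']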