Files
Parser_recipes/function.py

154 lines
3.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import requests
from bs4 import BeautifulSoup as bs
import re
from dns.name import empty
def try_request(link, max_retries=5, timeout=10):
    """Fetch ``link`` with HTTP GET, retrying until a 200 response arrives.

    Args:
        link: URL to request.
        max_retries: Maximum number of attempts before giving up.
        timeout: Seconds to wait for the server on each attempt, so that a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The response object of the first attempt that answered with status
        code 200, or ``None`` when every attempt failed.
    """
    for _ in range(max_retries):
        try:
            response = requests.get(link, timeout=timeout)
        except requests.RequestException:
            # Request-level failure (connection error, timeout, bad URL):
            # count it as a failed attempt and try again.
            continue
        if response.status_code == 200:
            return response
    return None
def try_soup(response):
    """Parse the HTML text of ``response`` with the ``html.parser`` backend.

    Args:
        response: Object exposing the page HTML through a ``text`` attribute.

    Returns:
        The parsed document, or ``False`` when parsing failed (for example
        because ``response`` is ``None`` and has no ``text`` attribute).
    """
    try:
        return bs(response.text, 'html.parser')
    except Exception:
        # Deliberately best-effort, but ``Exception`` (instead of a bare
        # ``except``) lets KeyboardInterrupt and SystemExit propagate.
        print('404')
        return False
def extract_nutrition(calories_info):
    """Collect calories, proteins, fats and carbs from nutrition elements.

    The first run of digits found in the text of each element is taken as
    that element's value; elements without digits are ignored.

    Args:
        calories_info: Iterable of elements exposing ``get_text()``.

    Returns:
        A dict with the keys ``calories``, ``proteins``, ``fats`` and
        ``carbs`` mapped to ints (in the order the numbers were found), or
        ``None`` when the input is unusable or does not yield exactly four
        numbers. A message is printed in the ``None`` case.
    """
    keys = ('calories', 'proteins', 'fats', 'carbs')
    numbers = []
    try:
        for item in calories_info:
            match = re.search(r'\d+', item.get_text())
            if match:
                numbers.append(int(match.group()))
    except (AttributeError, TypeError):
        # Input is not iterable or an item has no get_text(): treat it the
        # same as "nothing found" instead of crashing the caller.
        numbers = []

    if len(numbers) != len(keys):
        print('БЖУ не найдены')
        return None
    return dict(zip(keys, numbers))
def extract_tags_from_detailed_tags(main_container):
    """Read the tag groups listed in the ``detailed_tags`` element.

    Every ``<span class="b">`` inside that element is treated as a group
    label (a trailing colon is removed); the links inside the following
    sibling ``<span>`` are the tags of the group.

    Args:
        main_container: Parsed element searched for ``detailed_tags``.

    Returns:
        A dict mapping group label to a list of tag strings. Known Russian
        labels are renamed to ``occasion``, ``type_dish`` and ``cuisine``;
        the main-ingredient group is skipped; any other label is kept as is.
        An empty dict is returned when there is no ``detailed_tags`` element.
    """
    renamed_labels = {
        'Назначение': 'occasion',
        'Блюдо': 'type_dish',
        'География кухни': 'cuisine',
    }

    detailed_tags = main_container.find(class_='detailed_tags')
    if detailed_tags is None:
        # Page without a tag section: nothing to collect.
        return {}

    tags = {}
    for span_b in detailed_tags.find_all('span', class_='b'):
        label = span_b.get_text(strip=True).rstrip(':')
        if label == 'Основной ингредиент':
            continue
        next_span = span_b.find_next_sibling('span')
        if next_span:
            tag_list = [a.get_text(strip=True) for a in next_span.find_all('a')]
        else:
            tag_list = []
        tags[renamed_labels.get(label, label)] = tag_list
    return tags
def try_extr_ingredient(span_b, class_, portions=1):
    """Read one field of an ingredient row.

    Args:
        span_b: Ingredient element that is searched for the field.
        class_: CSS class of the child element holding the field.
        portions: Divisor applied to the value when it is numeric.

    Returns:
        ``float(text) / portions`` when the field text parses as a number,
        the stripped text itself when it does not, and ``None`` when the
        child element could not be found.
    """
    try:
        node = span_b.find(class_=class_)
        text = node.get_text(strip=True)
    except AttributeError:
        # No such child (find() returned None) or span_b itself is unusable.
        return None

    try:
        return float(text) / portions
    except ValueError:
        # Not a number (e.g. an ingredient name or unit): keep the raw text.
        return text
def extr_ingredient(main_container):
    """Collect the ingredients of a recipe, with amounts divided by the yield.

    The yield is read as an integer from the ``yield value`` element; every
    ``ingredient flex-dot-line`` element then contributes one entry.

    Args:
        main_container: Parsed element containing the yield and the
            ingredient rows.

    Returns:
        A dict mapping ingredient name to ``{'unit': ..., 'amount': ...}``,
        where ``amount`` has been divided by the yield when it is numeric.
    """
    yield_node = main_container.find(class_='yield value')
    portions = int(yield_node.get_text(strip=True))

    ingredients = {}
    rows = main_container.find_all(class_='ingredient flex-dot-line')
    for row in rows:
        name = try_extr_ingredient(row, class_='name')
        amount = try_extr_ingredient(row, 'value', portions)
        unit = try_extr_ingredient(row, 'u-unit-name')
        ingredients[name] = {'unit': unit, 'amount': amount}
    return ingredients
def extr_steps(main_container):
    """Collect the cooking steps from the ``instructions`` element.

    Steps are first looked up as ``stepphotos`` elements, whose ``href`` and
    ``title`` attributes give the image and the step text. Some pages have no
    step photos; there the steps are searched as
    ``detailed_step_description_big noPhotoStep`` elements, which carry no
    ``title`` attribute, so the text comes from ``get_text()`` instead.

    Args:
        main_container: Parsed element searched for the ``instructions``
            element.

    Returns:
        A list of ``{'img': ..., 'title': ...}`` dicts in the order the
        steps were found; ``img`` is ``None`` when the element has no
        ``href``. An empty list is returned when there is no
        ``instructions`` element.
    """
    instructions = main_container.find(class_='instructions')
    if instructions is None:
        # Page without an instructions section: no steps to report.
        return []

    step_nodes = instructions.find_all(class_='stepphotos')
    if not step_nodes:
        # Fallback for pages whose steps have no photos.
        step_nodes = instructions.find_all(
            class_='detailed_step_description_big noPhotoStep'
        )

    steps = []
    for node in step_nodes:
        img = node.get('href')
        title = node.get('title')
        if title is None:
            # No title attribute on photo-less steps: use the element text.
            title = node.get_text()
        steps.append({'img': img, 'title': title})
    return steps