Files
Parser_recipes/function.py
zein 12c4dcf33b Финальная сборка.
Собраны все ссылки, удалены дубликаты. Осталось 85к ссылок. Парсим рецепты
2025-11-24 03:24:08 +03:00

170 lines
4.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from os import times
import requests
from bs4 import BeautifulSoup as bs
import re
import time
from dns.name import empty
def try_request(link, max_retries=5, *, timeout=30, delay=2):
    """Fetch *link* with an HTTP GET, retrying on failure.

    An attempt counts as failed when ``requests.get`` raises or when the
    response status code is not 200.

    Args:
        link: URL to request.
        max_retries: Maximum number of attempts.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block the caller forever.
        delay: Seconds to sleep between two attempts.

    Returns:
        The ``requests`` response of the first attempt that answered with
        status 200, or ``None`` when every attempt failed.
    """
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(link, timeout=timeout)
        except Exception:
            # Best-effort fetch: any request error counts as a failed attempt.
            # ``Exception`` (not a bare ``except``) keeps Ctrl-C and
            # SystemExit able to stop the retry loop.
            response = None
        if response is not None and response.status_code == 200:
            return response
        # No point in sleeping once the last attempt has been used up.
        if attempt < max_retries:
            time.sleep(delay)
    return None
def try_soup(response):
    """Parse the body of *response* with BeautifulSoup's ``html.parser``.

    Args:
        response: Object exposing the page markup as ``.text``. ``None`` (or
            anything without ``.text``) is treated as a failure.

    Returns:
        The parsed soup, or ``False`` when the response could not be read or
        parsed; in that case ``'404'`` is printed.
    """
    try:
        # Reading ``.text`` first makes a missing response fail here, before
        # the parser is involved at all.
        html = response.text
        return bs(html, 'html.parser')
    except Exception:
        print('404')
        return False
def extract_nutrition(calories_info):
    """Collect calories, proteins, fats and carbs from nutrition elements.

    The first run of digits found in each element's text is taken as that
    element's value; elements without digits are ignored.

    Args:
        calories_info: Iterable of elements exposing ``get_text()``.

    Returns:
        A dict with the keys ``calories``, ``proteins``, ``fats`` and
        ``carbs`` (ints, in the order the numbers were found), or ``None``
        when the input is not iterable, an element has no usable text, or the
        number of values found is not exactly four.
    """
    keys = ('calories', 'proteins', 'fats', 'carbs')
    numbers = []
    try:
        for item in calories_info:
            match = re.search(r'\d+', item.get_text())
            if match:
                numbers.append(int(match.group()))
    except (TypeError, AttributeError):
        # Not iterable (e.g. None), or an element without a text accessor.
        return None
    if len(numbers) != len(keys):
        return None
    return dict(zip(keys, numbers))
def extract_tags_from_detailed_tags(main_container):
    """Read the tag groups found under the ``detailed_tags`` element.

    Every ``<span class="b">`` inside that element is a group label (a
    trailing colon is stripped); the links inside its next sibling ``<span>``
    are the group's tags. A label without such a sibling gets an empty list.

    Label handling:
        * ``'Назначение'``      -> stored under ``'occasion'``
        * ``'Блюдо'``           -> stored under ``'type_dish'``
        * ``'География кухни'`` -> stored under ``'cuisine'``
        * ``'Основной ингредиент'`` -> skipped
        * any other label is stored under its original text.

    Args:
        main_container: Parsed element to search in.

    Returns:
        Dict mapping group key to a list of tag strings, or ``None`` when
        extraction failed (the error is printed), e.g. because the
        ``detailed_tags`` element is missing.
    """
    renamed = {
        'Назначение': 'occasion',
        'Блюдо': 'type_dish',
        'География кухни': 'cuisine',
    }
    skipped = {'Основной ингредиент'}
    try:
        detailed_tags = main_container.find(class_='detailed_tags')
        tags = {}
        for span_b in detailed_tags.find_all('span', class_='b'):
            label = span_b.get_text(strip=True).rstrip(':')
            next_span = span_b.find_next_sibling('span')
            if next_span:
                tag_list = [a.get_text(strip=True) for a in next_span.find_all('a')]
            else:
                tag_list = []
            if label in skipped:
                continue
            tags[renamed.get(label, label)] = tag_list
        return tags
    except Exception as e:
        # Best-effort scraping: report the problem and signal failure.
        print(e)
        return None
def try_extr_ingredient(span_b, class_, portions=1):
    """Return the text of the child of *span_b* that has class *class_*.

    When the text parses as a float it is divided by *portions* and the
    resulting float is returned; otherwise the stripped text itself is
    returned.

    Args:
        span_b: Element to search in.
        class_: CSS class of the child element to read.
        portions: Divisor applied to numeric values.

    Returns:
        A float (numeric text divided by *portions*), the text as ``str``
        (non-numeric), or ``None`` when no matching child exists.

    Raises:
        ZeroDivisionError: If the text is numeric and *portions* is 0.
    """
    try:
        item = span_b.find(class_=class_).get_text(strip=True)
    except AttributeError:
        # ``find`` returned None (no such child) or ``span_b`` is unusable.
        return None
    try:
        return float(item) / portions
    except ValueError:
        # Not a number (e.g. an ingredient or unit name): keep the text.
        return item
def extr_ingredient(main_container):
    """Collect the recipe's ingredients.

    The integer text of the ``yield value`` element is used as the divisor
    for every ingredient amount. Each ``ingredient flex-dot-line`` element
    contributes one entry built from its ``name``, ``value`` and
    ``u-unit-name`` children (read via ``try_extr_ingredient``).

    Args:
        main_container: Parsed element to search in.

    Returns:
        Dict mapping ingredient name to ``{'unit': ..., 'amount': ...}``, or
        ``None`` when extraction failed (the error is printed), e.g. because
        the ``yield value`` element is missing or is not an integer.
    """
    try:
        portions = int(main_container.find(class_='yield value').get_text(strip=True))
        ingredients = {}
        for row in main_container.find_all(class_='ingredient flex-dot-line'):
            name = try_extr_ingredient(row, class_='name')
            amount = try_extr_ingredient(row, 'value', portions)
            unit = try_extr_ingredient(row, 'u-unit-name')
            ingredients[name] = {'unit': unit, 'amount': amount}
        return ingredients
    except Exception as e:
        # Best-effort scraping: report the problem and signal failure.
        print(e)
        return None
def extr_steps(main_container):
    """Collect the cooking steps found under the ``instructions`` element.

    Steps are normally ``stepphotos`` elements, whose ``href`` and ``title``
    attributes give the image and the step text. Some pages have no photos;
    when no ``stepphotos`` element exists, the
    ``detailed_step_description_big noPhotoStep`` elements are used instead.
    For any step without a ``title`` attribute the element's text content
    becomes the title.

    Args:
        main_container: Parsed element to search in.

    Returns:
        List of ``{'img': ..., 'title': ...}`` dicts (``img`` is ``None``
        when the element has no ``href``), or ``None`` when extraction
        failed, e.g. because the ``instructions`` element is missing.
    """
    try:
        instructions = main_container.find(class_='instructions')
        step_nodes = instructions.find_all(class_='stepphotos')
        if not step_nodes:
            # Photo-less page layout: fall back to the text-only step class.
            step_nodes = instructions.find_all(
                class_='detailed_step_description_big noPhotoStep'
            )
        steps = []
        for node in step_nodes:
            title = node.get('title')
            if title is None:
                # No ``title`` attribute on this element: use its text.
                title = node.get_text()
            steps.append({'img': node.get('href'), 'title': title})
        return steps
    except Exception:
        # Best-effort scraping: any failure means "no steps available".
        return None