Compare commits

3 Commits: cbb56871e8...master

| Author | SHA1 | Date |
|---|---|---|
| | 12c4dcf33b | |
| | 8ff6022d8c | |
| | 80eab4e9bd | |
112 function.py

@@ -1,6 +1,9 @@
from os import times
import requests
from bs4 import BeautifulSoup as bs
import re
import time

from dns.name import empty
@@ -15,11 +18,14 @@ def try_request(link, max_retries=5):
                return response
            else:
                retries += 1
                time.sleep(2)
        except:
            retries += 1
            time.sleep(2)

def try_soup(response):
-    try: return bs(response.text, 'html.parser')
+    try:
+        return bs(response.text, 'html.parser')
    except:
        print('404')
        return False
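Only the tail of the retry loop is visible in this hunk. For orientation, a minimal sketch of what the full helper presumably looks like; everything outside the hunk (the request call, the status check, the final return) is an assumption:

```python
import time
import requests

def try_request(link, max_retries=5):
    retries = 0
    while retries < max_retries:
        try:
            # The request call itself is outside the hunk, so this line is assumed.
            response = requests.get(link, timeout=10)
            if response.status_code == 200:
                return response
            else:
                retries += 1
                time.sleep(2)
        except requests.RequestException:  # the diff uses a bare except
            retries += 1
            time.sleep(2)
    return None  # assumed fall-through once all retries are exhausted
```

try_soup then wraps the response in BeautifulSoup and returns False on failure, which is what pars_dishs later checks with `if soup == False`.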
@@ -41,30 +47,33 @@ def extract_nutrition(calories_info):

        return dict(zip(['calories', 'proteins', 'fats', 'carbs'], numbers))
    except:
-        return print('БЖУ не найдены')
+        return None


def extract_tags_from_detailed_tags(main_container):
+    try:
        detailed_tags = main_container.find(class_='detailed_tags')
        tags = {}

        for span_b in detailed_tags.find_all('span', class_='b'):
            label = span_b.get_text(strip=True).rstrip(':')
            next_span = span_b.find_next_sibling('span')

            if next_span:
                tag_list = [a.get_text(strip=True) for a in next_span.find_all('a')]
            else:
                tag_list = []

            if label == 'Назначение': label = 'occasion'
            elif label == 'Основной ингредиент': continue
            elif label == 'Блюдо': label = 'type_dish'
            elif label == 'География кухни': label = 'cuisine'

            tags[label] = tag_list

        return tags
+    except Exception as e:
+        print(e)
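As a usage illustration (not part of the diff), this is roughly what the label mapping above produces for a detailed_tags block; the HTML snippet is invented and only two of the four label branches are shown:

```python
from bs4 import BeautifulSoup as bs

# Invented markup mimicking povar.ru's detailed_tags block.
html = """
<div class="detailed_tags">
  <span class="b">Назначение:</span><span><a>Обед</a><a>Ужин</a></span>
  <span class="b">Блюдо:</span><span><a>Паста</a></span>
</div>
"""

soup = bs(html, 'html.parser')
tags = {}
for span_b in soup.find(class_='detailed_tags').find_all('span', class_='b'):
    label = span_b.get_text(strip=True).rstrip(':')
    next_span = span_b.find_next_sibling('span')
    tag_list = [a.get_text(strip=True) for a in next_span.find_all('a')] if next_span else []
    if label == 'Назначение':
        label = 'occasion'
    elif label == 'Блюдо':
        label = 'type_dish'
    tags[label] = tag_list

print(tags)  # {'occasion': ['Обед', 'Ужин'], 'type_dish': ['Паста']}
```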
@@ -79,53 +88,60 @@ def try_extr_ingredient(span_b, class_, portions=1):


def extr_ingredient(main_container):
+    try:
        # Collect the ingredients
        portions = int(main_container.find(class_='yield value').get_text(strip=True))

        tags = {}

        for span_b in main_container.find_all(class_='ingredient flex-dot-line'):

            label = try_extr_ingredient(span_b, class_='name')
            value_ingredient = try_extr_ingredient(span_b, 'value', portions)
            unit_name = try_extr_ingredient(span_b, 'u-unit-name')

            #print(label, value_ingredient, unit_name)

            tags[label] = {'unit':unit_name, 'amount':value_ingredient}

        return tags
+    except Exception as e:
+        print(e)


def extr_steps(main_container):
+    try:
        # The site has exception pages for the cooking steps: not every step has a photo.
        # On those pages we look for the class 'detailed_step_description_big noPhotoStep',
        # which exposes its text via get_text() rather than a title attribute.

        recipeInstructions = main_container.find(class_='instructions')
        steps = []
        count = 1

        main_container = recipeInstructions.find_all(class_='stepphotos')

        # Check for an exception page (no step photos)
        if not main_container:
            main_container = recipeInstructions.find_all(class_='detailed_step_description_big noPhotoStep')

        for items in main_container:
            img = items.get('href')
            title = items.get('title')

            # For 'detailed_step_description_big noPhotoStep' take the text via get_text();
            # title is empty at this point because the attribute is absent on such pages.
            if title is None:
                title = items.get_text()  # title is now filled in
-                print(title)

            steps.append({
                'img': img,
                'title': title
            })

        return steps
+    except Exception as e:
+        return None
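The comments in extr_steps describe recipe pages without step photos, where the step text lives in the element body instead of a title attribute. A small illustration of that fallback on invented markup (only the class names come from the diff):

```python
from bs4 import BeautifulSoup as bs

# Invented instructions block with no step photos, mimicking the exception pages
# the comments describe; only the class names are taken from the diff.
html = """
<div class="instructions">
  <div class="detailed_step_description_big noPhotoStep">Boil the pasta.</div>
  <div class="detailed_step_description_big noPhotoStep">Fry the bacon.</div>
</div>
"""

instructions = bs(html, 'html.parser').find(class_='instructions')

steps_nodes = instructions.find_all(class_='stepphotos')
if not steps_nodes:  # exception page: no photos, fall back to the text-only class
    steps_nodes = instructions.find_all(class_='detailed_step_description_big noPhotoStep')

steps = []
for item in steps_nodes:
    img = item.get('href')        # None on photo-less pages
    title = item.get('title')
    if title is None:             # no title attribute, take the visible text instead
        title = item.get_text(strip=True)
    steps.append({'img': img, 'title': title})

print(steps)
# [{'img': None, 'title': 'Boil the pasta.'}, {'img': None, 'title': 'Fry the bacon.'}]
```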
import_in_BD.py

@@ -1,4 +1,7 @@
-from pymongo import MongoClient
+import log_err as lg
+
+from pymongo import MongoClient, ReplaceOne

def connect_to_mongo():
    """Connect to MongoDB"""

@@ -7,11 +10,35 @@ def connect_to_mongo():
    return db["Test"]


-def import_json_in_mongo(data):
+def import_json_in_mongo(recipe_data):
+    """Saves a single recipe (kept for backwards compatibility and error handling)"""
    collection = connect_to_mongo()
-    collection.insert_one(data)
+    try:
+        collection.replace_one({"_id": recipe_data["_id"]}, recipe_data, upsert=True)
+        print(f"✅ Рецепт '{recipe_data.get('recipe_name', recipe_data['_id'])}' успешно сохранён.")
+    except Exception as e:
+        print(f"❌ Ошибка при сохранении рецепта {recipe_data.get('_id')}: {e}")
+        url = recipe_data.get('url', 'unknown')
+        lg.log_error(url, str(e), 'Buff_err.json')
+
+
+def bulk_write_recipes(recipes_list):
+    """Saves a list of recipes in bulk via bulk_write"""
+    if not recipes_list:
+        return
+
+    collection = connect_to_mongo()
+    try:
+        requests = [
+            ReplaceOne({"_id": r["_id"]}, r, upsert=True)
+            for r in recipes_list
+        ]
+        result = collection.bulk_write(requests, ordered=False)
+        print(f"✅ Bulk-запись: {len(recipes_list)} рецептов "
+              f"(upserted: {result.upserted_count}, modified: {result.modified_count})")
+    except Exception as e:
+        print(f"❌ Ошибка при bulk-записи: {e}")
+        # Optional: recipes_list could be dumped to a file here for reprocessing
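For reference, the pymongo pattern the two docstrings describe, as a standalone sketch. The connection URI, database name, and sample documents are made up; only the Test collection name and the ReplaceOne/bulk_write usage mirror the diff:

```python
from pymongo import MongoClient, ReplaceOne

# Hypothetical connection details: the diff only shows that connect_to_mongo()
# returns db["Test"], not the URI or the database name.
client = MongoClient("mongodb://localhost:27017")
collection = client["recipes_db"]["Test"]

# Invented sample documents in the shape pars_recipie() builds.
recipes = [
    {"_id": "slivochnaya_karbonara-73186", "recipe_name": "Сливочная карбонара",
     "url": "https://povar.ru/recipes/slivochnaya_karbonara-73186.html"},
    {"_id": "podjarka_k_makaronam-60879", "recipe_name": "Поджарка к макаронам",
     "url": "https://povar.ru/recipes/podjarka_k_makaronam-60879.html"},
]

# Each document is upserted by _id in a single round trip;
# ordered=False lets the batch continue past individual failures.
requests = [ReplaceOne({"_id": r["_id"]}, r, upsert=True) for r in recipes]
result = collection.bulk_write(requests, ordered=False)
print(result.upserted_count, result.modified_count)
```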
39 log_err.py (new file)

@@ -0,0 +1,39 @@
import json
import os

def log_error(url, error, filename='Err.json'):
    """
    Appends an error to the list of errors in a JSON file.

    Args:
        url (str): URL on which the error occurred.
        error (str or Exception): Description of the error.
        filename (str): Name of the log file (defaults to 'Err.json').
    """
    # Convert error to a string to avoid serialization problems
    error_str = str(error)

    # Load existing errors (if the file exists)
    if os.path.exists(filename):
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                errors = json.load(f)
        except (json.JSONDecodeError, OSError):
            errors = []
    else:
        errors = []

    # Append the new error
    errors.append({
        "url": url.strip(),
        "error": error_str
    })

    # Write back
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(errors, f, ensure_ascii=False, indent=4)
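A short usage sketch for the new helper (the URLs are invented; 'Нет шагов' is the message parser.py actually logs). The log file holds a single JSON array, so it can be read back with one json.load:

```python
import json
import log_err as lg

# Append two failures; Err.json is created on first use.
lg.log_error("https://povar.ru/recipes/some_recipe.html", "Нет шагов")
lg.log_error("https://povar.ru/recipes/other_recipe.html", ValueError("bad nutrition block"))

# The log is a plain JSON list of {"url": ..., "error": ...} objects.
with open("Err.json", encoding="utf-8") as f:
    print(json.load(f))
```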
167 parser.py

@@ -1,120 +1,101 @@
import requests
from bs4 import BeautifulSoup as bs
import re
import function as f
import json
import os
import log_err as lg

import import_in_BD as ib

link = 'https://povar.ru/list/'
+buffer = []
+
+
+def buffer_import():
+    global buffer
+    try:
+        ib.bulk_write_recipes(buffer)
+        buffer.clear()
+    except Exception as bulk_e:
+        print(f"⚠️ Bulk-ошибка: {bulk_e}. Переключаемся на поштучную запись...")
+        for recipe in buffer:
+            try:
+                ib.import_json_in_mongo(recipe)  # <- ONE recipe at a time
+            except Exception as single_e:
+                lg.log_error(recipe.get('url', 'unknown'), f"MongoDB-фейл: {single_e}")
+        buffer.clear()


def pars_group(link):
    # Collect the dish categories
    response = f.try_request(link)
    soup = bs(response.text, 'html.parser')

    main_container = soup.find_all(class_='ingredientItem')

    for items in main_container:
        item = items.find_all('a')
        title = item[0].get_text()

        if title == 'Салаты': break

        print(title)

        for i in item[1::]:
            name_group = i.get_text()
            link_group = 'https://povar.ru' + i.get('href')
            print('-'*5, name_group, link_group)

            pars_dishs(title, name_group, link_group)

        print('-'*50)


def pars_dishs(title='', name_group='', link='https://povar.ru/list/spagetti/', page=0):
    # Collect the list of recipes
    while True:
        page += 1
        new_link = link + str(page)
        soup = f.try_soup(f.try_request(new_link))

        if soup == False: break

        main_container = soup.find_all(class_='listRecipieTitle')

        for items in main_container:
            recipe_name = items.get_text()
            recipe_link = 'https://povar.ru' + items.get('href')

            print(recipe_name, recipe_link)
            pars_recipie(title, name_group, recipe_name, recipe_link)

        print('-'*50)


-def pars_recipie(title=0, name_group=0, recipe_name=0 ,link='https://povar.ru/recipes/slivochnaya_karbonara-73186.html'):
-
-    response = f.try_request(link)
-    soup = bs(response.text, 'html.parser')
-
-    main_container = soup.find(class_='cont_area hrecipe')
-
-    name_id = link.split('/')[-1]
-    try:
-        name_id = name_id.replace('.html', '')
-    except: pass
-
-    print(name_id)
-
-    photo = main_container.find(class_='photo').get('src')
-
-    recipies = {'recipes': {}}
-
-    detailed_tags = f.extract_tags_from_detailed_tags(main_container)  # collect the tags
-    #print(detailed_tags)
-
-    ingredients = f.extr_ingredient(main_container)  # collect the ingredients
-    #print(ingredients)
-
-    calories_info = f.extract_nutrition(main_container.find_all(class_='circle'))  # nutrition facts
-    #print(calories_info)
-
-    steps = f.extr_steps(main_container)  # collect the steps
-    #print(steps)
-
-    recip = {'_id' : name_id,
-             'recipe_name':recipe_name,
-             'url':link,
-             'preview_img':photo,
-             'tags':detailed_tags,
-             'ingredients':ingredients,
-             'nutritional_value':calories_info,
-             'steps':steps}
-
-    print(recip)
-    print(len(steps))


+def open_url():
+    total_type_recip = json.load(open('unique_urls.json', 'r', encoding='utf-8'))
+
+    for url in total_type_recip:
+        if len(buffer) >= 200:
+            print('❗️ Сейвим', len(buffer))
+            print(buffer)
+            buffer_import()
+
+        pars_recipie(url)
+
+    buffer_import()
+
+
+def pars_recipie(url='https://povar.ru/recipes/slivochnaya_karbonara-73186.html'):
+
+    try:
+        response = f.try_request(url)
+        soup = bs(response.text, 'html.parser')
+
+        recipe_name = soup.find(class_='detailed fn').text
+
+        main_container = soup.find(class_='cont_area hrecipe')
+
+        steps = f.extr_steps(main_container)  # collect the steps
+
+        if steps is None:
+            lg.log_error(url, 'Нет шагов')
+            return None
+
+        name_id = url.split('/')[-1].strip()
+        try:
+            name_id = name_id.replace('.html', '').strip()
+        except: pass
+
+        photo = main_container.find(class_='photo').get('src')
+
+        detailed_tags = f.extract_tags_from_detailed_tags(main_container)  # collect the tags
+        ingredients = f.extr_ingredient(main_container)  # collect the ingredients
+        calories_info = f.extract_nutrition(main_container.find_all(class_='circle'))  # nutrition facts
+
+        recip = {'_id' : name_id,
+                 'recipe_name':recipe_name,
+                 'url':url,
+                 'preview_img':photo,
+                 'tags':detailed_tags,
+                 'ingredients':ingredients,
+                 'nutritional_value':calories_info,
+                 'steps':steps}
+
+        print('⭕', recipe_name)
+        print('🤍', recip)
+
+        #ib.import_json_in_mongo(recipies)
+        buffer.append(recip)
+
+    except Exception as e:
+        print(url, e)
+        lg.log_error(url, e)


#pars_group(link)
#pars_dishs()
-pars_recipie(link="https://povar.ru/recipes/podjarka_k_makaronam-60879.html")

#pars_recipie()
+open_url()
85021 unique_urls.json (new file)

File diff suppressed because it is too large.
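open_url() iterates whatever json.load returns from unique_urls.json, so the file is presumably a flat JSON array of recipe URL strings (85021 of them in this commit). A tiny sketch of that assumed shape, using the two URLs that appear elsewhere in the diff:

```python
import json

# Assumed shape of unique_urls.json: a flat list of recipe URLs.
# These two URLs come from the diff; the real file has 85021 entries.
urls = [
    "https://povar.ru/recipes/slivochnaya_karbonara-73186.html",
    "https://povar.ru/recipes/podjarka_k_makaronam-60879.html",
]

with open('unique_urls.json', 'w', encoding='utf-8') as f:
    json.dump(urls, f, ensure_ascii=False, indent=4)

# open_url() then json.load()s this list and feeds each URL to pars_recipie().
```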