Финальная сборка.

Собраны все ссылки, удалены дубликаты. Осталось 85к ссылок. Парсим рецепты.
2025-11-24 03:24:08 +03:00
5 changed files with 85221 additions and 191 deletions
--- a/function.py
+++ b/function.py
@@ -1,6 +1,9 @@
 from os import times
 import requests
 from bs4 import BeautifulSoup as bs
 import re
 import time
 from dns.name import empty
@@ -15,8 +18,10 @@ def try_request(link, max_retries=5):
                return response
            else:
                retries += 1
                time.sleep(2)
        except:
            retries += 1
            time.sleep(2)
 def try_soup(response):
    try:
@@ -42,9 +47,10 @@ def extract_nutrition(calories_info):
        return dict(zip(['calories', 'proteins', 'fats', 'carbs'], numbers))
    except:
-        return print('БЖУ не найдены')
+        return None
 def extract_tags_from_detailed_tags(main_container):
    try:
        detailed_tags = main_container.find(class_='detailed_tags')
        tags = {}
@@ -66,6 +72,8 @@ def extract_tags_from_detailed_tags(main_container):
            tags[label] = tag_list
        return tags
    except Exception as e:
        print(e)
 def try_extr_ingredient(span_b, class_, portions=1):
@@ -80,6 +88,7 @@ def try_extr_ingredient(span_b, class_, portions=1):
 def extr_ingredient(main_container):
    try:
        #Сбор ингредиентов
        portions = int(main_container.find(class_='yield value').get_text(strip=True))
@@ -96,8 +105,12 @@ def extr_ingredient(main_container):
            tags[label] = {'unit':unit_name, 'amount':value_ingredient}
        return tags
    except Exception as e:
        print(e)
 def extr_steps(main_container):
    try:
        # На сайте есть страницы исключения по шагам готовки. Фото есть не везде, тогда ищем класс detailed_step_description_big noPhotoStep
        # Класс detailed_step_description_big noPhotoStep ищет текст через get_text(), а не через тег title
@@ -125,7 +138,10 @@ def extr_steps(main_container):
                'img': img,
                'title': title
            })
        return steps
    except Exception as e:
        return None
--- a/import_in_BD.py
+++ b/import_in_BD.py
@@ -1,4 +1,7 @@
 from pymongo import MongoClient
 import log_err as lg
 from pymongo import MongoClient, ReplaceOne
 def connect_to_mongo():
    """Подключение к MongoDB"""
@@ -8,16 +11,34 @@ def connect_to_mongo():
 def import_json_in_mongo(recipe_data):
    """Сохраняет один рецепт (для обратной совместимости и ошибок)"""
    collection = connect_to_mongo()
    print(recipe_data)
    try:
        collection.replace_one({"_id": recipe_data["_id"]}, recipe_data, upsert=True)
-        print(f"Рецепт '{recipe_data.get('recipe_name', recipe_data['_id'])}' успешно сохранён.")
+        print(f"✅ Рецепт '{recipe_data.get('recipe_name', recipe_data['_id'])}' успешно сохранён.")
    except Exception as e:
-        print(f"Ошибка при сохранении рецепта {recipe_data.get('_id')}: {e}")
+        print(f"❌ Ошибка при сохранении рецепта {recipe_data.get('_id')}: {e}")
-
+        url = recipe_data.get('url', 'unknown')
        lg.log_error(url, str(e), 'Buff_err.json')
 def bulk_write_recipes(recipes_list):
    """Upsert a list of recipes into MongoDB with a single ``bulk_write`` call.

    Every recipe becomes a ``ReplaceOne`` operation keyed on its ``"_id"``
    with ``upsert=True``; the batch is submitted with ``ordered=False``.

    Args:
        recipes_list: List of recipe dicts, each containing an ``"_id"`` key.
            An empty list (or ``None``) is a no-op.

    Returns:
        None.

    Raises:
        Exception: Any error raised while building or executing the batch is
            printed and then re-raised. Swallowing it here would make the
            caller believe the batch was stored, so a caller-side fallback
            to saving the recipes one by one could never run.
    """
    if not recipes_list:
        return
    collection = connect_to_mongo()
    try:
        # Named ``operations`` (not ``requests``) so the list is not confused
        # with the HTTP library of the same name used elsewhere in the project.
        operations = [
            ReplaceOne({"_id": recipe["_id"]}, recipe, upsert=True)
            for recipe in recipes_list
        ]
        result = collection.bulk_write(operations, ordered=False)
    except Exception as e:
        print(f"❌ Ошибка при bulk-записи: {e}")
        # Optionally, recipes_list could be dumped to a file here for a later
        # retry. Re-raise so the caller can react to the failed batch.
        raise
    print(f"✅ Bulk-запись: {len(recipes_list)} рецептов "
          f"(upserted: {result.upserted_count}, modified: {result.modified_count})")
--- a/log_err.py
+++ b/log_err.py
@@ -0,0 +1,39 @@
 import json
 import os
 def log_error(url, error, filename='Err.json'):
    """Append one error record to a JSON file that holds a list of errors.

    The existing file (if any) is read in full, the new record is appended
    and the whole list is written back with ``indent=4`` and
    ``ensure_ascii=False``.

    Args:
        url: URL on which the error occurred. It is converted with ``str()``
            before stripping, so ``None`` or other non-string values do not
            crash the logger.
        error: Error description or exception object; stored as
            ``str(error)``.
        filename: Path of the JSON log file (default ``'Err.json'``).

    Returns:
        None.

    Raises:
        OSError: If the log file cannot be opened for writing.
    """
    # Stringify both fields so the record is always JSON-serializable.
    record = {
        "url": str(url).strip(),
        "error": str(error),
    }

    errors = []
    if os.path.exists(filename):
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except (ValueError, OSError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError:
            # an unreadable or corrupted log is restarted from scratch.
            loaded = None
        # Only a JSON array can be appended to; any other content (dict,
        # string, null, ...) is treated the same way as a corrupted file.
        if isinstance(loaded, list):
            errors = loaded

    errors.append(record)

    # Write the complete list back.
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(errors, f, ensure_ascii=False, indent=4)
--- a/parser.py
+++ b/parser.py
@@ -3,166 +3,99 @@ from bs4 import BeautifulSoup as bs
 import function as f
 import json
 import os
 import log_err as lg
 import import_in_BD as ib
 link = 'https://povar.ru/list/'
 buffer = []
 def buffer_import():
    global buffer
 total_type_recip = {}
 def save_to_json(new_data, filename='total_type_recip.json'):
    # Загружаем существующие данные, если файл существует
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as f:
    try:
-                existing_data = json.load(f)
+        ib.bulk_write_recipes(buffer)
-            except json.JSONDecodeError:
+        buffer.clear()
-                existing_data = {}
+    except Exception as bulk_e:
-    else:
+        print(f"⚠️ Bulk-ошибка: {bulk_e}. Переключаемся на поштучную запись...")
-        existing_data = {}
+        for recipe in buffer:
-
+            try:
-    # Сливаем new_data в existing_data
+                ib.import_json_in_mongo(recipe)  # ← ОДИН рецепт
-    for category, groups in new_data.items():
+            except Exception as single_e:
-        if category not in existing_data:
+                lg.log_error(recipe.get('url', 'unknown'), f"MongoDB-фейл: {single_e}")
-            existing_data[category] = {}
+        buffer.clear()
        for group, recipes in groups.items():
            # Перезаписываем только если ещё не было или чтобы не дублировать — можно использовать set позже
            existing_data[category][group] = recipes
    # Сохраняем обратно
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(existing_data, f, ensure_ascii=False, indent=4)
 def pars_group(link):
    #Сбор видов блюд
-    response = f.try_request(link)
+def open_url():
    total_type_recip = json.load(open('unique_urls.json', 'r', encoding='utf-8'))
    for url in total_type_recip:
        if len(buffer) >= 200:
            print('❗️ Сейвим', len(buffer))
            print(buffer)
            buffer_import()
        pars_recipie(url)
    buffer_import()
 def pars_recipie(url='https://povar.ru/recipes/slivochnaya_karbonara-73186.html'):
    try:
        response = f.try_request(url)
        soup = bs(response.text, 'html.parser')
-    main_container = soup.find_all(class_='ingredientItem')
+        recipe_name = soup.find(class_='detailed fn').text
    for items in main_container:
        item = items.find_all('a')
        title = items.find_all('h2')
        title = title[0].get_text()
        #if title == 'Выпечка': break
        # Инициализируем категорию, если ещё не создана
        if title not in total_type_recip:
            total_type_recip[title] = {}
        print(title)
        for i in item[1::]:
            name_group = i.get_text()
            link_group = 'https://povar.ru' + i.get('href')
            print('-'*5, name_group, link_group)
            total_type_recip[title][name_group] = []
            pars_dishs(title, name_group, link_group)
        print('-'*50)
 def pars_dishs(title='', name_group='', link='https://povar.ru/list/spagetti/', page=0):
    """Collect the recipe list of one group by walking its numbered pages.

    Pages are requested as ``link + str(n)`` starting from ``page + 1``; the
    walk stops as soon as ``f.try_soup`` returns a value equal to ``False``.
    For every element with class ``listRecipieTitle`` the text and the
    absolute link are printed and remembered.

    When the walk ends, the collected list is stored in
    ``total_type_recip[title][name_group]`` and the whole dictionary is
    passed to ``save_to_json``.

    Args:
        title: Category key in ``total_type_recip``.
            NOTE(review): the key apparently must already exist there,
            otherwise the final assignment raises ``KeyError`` - confirm.
        name_group: Group key inside the category.
        link: Base URL of the group listing; the page number is appended.
        page: Number of the page preceding the first one to fetch.
    """
    global total_type_recip
    collected = []
    current_page = page
    while True:
        current_page += 1
        page_url = link + str(current_page)
        response = f.try_request(page_url)
        soup = f.try_soup(response)
        if soup == False:
            break
        for anchor in soup.find_all(class_='listRecipieTitle'):
            dish_name = anchor.get_text()
            dish_url = 'https://povar.ru' + anchor.get('href')
            print('-' * 10, dish_name, dish_url)
            collected.append({'name': dish_name, 'url': dish_url})
        print('-' * 50)
    # All pages are collected: publish them in the global structure ...
    total_type_recip[title][name_group] = collected
    # ... and immediately persist the whole dictionary to JSON.
    save_to_json(total_type_recip)
 def pars_recipie(title=0, name_group=0, recipe_name=0 ,link='https://povar.ru/recipes/slivochnaya_karbonara-73186.html'):
    response = f.try_request(link)
    soup = bs(response.text, 'html.parser')
        main_container = soup.find(class_='cont_area hrecipe')
    name_id = link.split('/')[-1]
    try:
        name_id = name_id.replace('.html', '')
    except: pass
-    print(name_id)
+        steps = f.extr_steps(main_container) #Сборка шагов
        if steps is None:
            lg.log_error(url, 'Нет шагов')
            return None
        name_id = url.split('/')[-1].strip()
        try:
            name_id = name_id.replace('.html', '').strip()
        except: pass
        photo = main_container.find(class_='photo').get('src')
    recipies = {'recipes': {}}
        detailed_tags = f.extract_tags_from_detailed_tags(main_container) #Собираем теги
        ingredients = f.extr_ingredient(main_container) #Собираем ингредиенты
        calories_info = f.extract_nutrition(main_container.find_all(class_='circle')) #БЖУ
    steps = f.extr_steps(main_container) #Сборка шагов
        recip = {'_id' : name_id,
                'recipe_name':recipe_name,
-                'url':link,
+                'url':url,
                'preview_img':photo,
                'tags':detailed_tags,
                'ingredients':ingredients,
                'nutritional_value':calories_info,
                'steps':steps}
-    print('Шагов - ',len(steps))
+        print('⭕',recipe_name)
        print('🤍',recip)
-    #ib.import_json_in_mongo(recip)
+        buffer.append(recip)
    except Exception as e:
        print(url, e)
        lg.log_error(url, e)
 pars_group(link)
 #pars_dishs()
 #pars_recipie()
-
+open_url()
--- a/unique_urls.json
+++ b/unique_urls.json