Compare commits

3 Commits: cbb56871e8...master

| Author | SHA1 | Date |
|---|---|---|
| | 12c4dcf33b | |
| | 8ff6022d8c | |
| | 80eab4e9bd | |
112 function.py

@@ -1,6 +1,9 @@
from os import times
import requests
from bs4 import BeautifulSoup as bs
import re
import time

from dns.name import empty
@@ -15,11 +18,14 @@ def try_request(link, max_retries=5):
                return response
            else:
                retries += 1
                time.sleep(2)
        except:
            retries += 1
            time.sleep(2)

def try_soup(response):
-    try: return bs(response.text, 'html.parser')
+    try:
+        return bs(response.text, 'html.parser')
    except:
        print('404')
        return False
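Only the tail of the retry loop is visible in this hunk. For orientation, a minimal sketch of what the full helper presumably looks like; everything outside the hunk (the request call, the status check, the final return) is an assumption:

```python
import time
import requests

def try_request(link, max_retries=5):
    retries = 0
    while retries < max_retries:
        try:
            # The request call itself is outside the hunk, so this line is assumed.
            response = requests.get(link, timeout=10)
            if response.status_code == 200:
                return response
            else:
                retries += 1
                time.sleep(2)
        except requests.RequestException:  # the diff uses a bare except
            retries += 1
            time.sleep(2)
    return None  # assumed fall-through once all retries are exhausted
```

try_soup then wraps the response in BeautifulSoup and returns False on failure, which is what pars_dishs later checks with `if soup == False`.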
@@ -41,30 +47,33 @@ def extract_nutrition(calories_info):

        return dict(zip(['calories', 'proteins', 'fats', 'carbs'], numbers))
    except:
-        return print('БЖУ не найдены')
+        return None


def extract_tags_from_detailed_tags(main_container):
+    try:
        detailed_tags = main_container.find(class_='detailed_tags')
        tags = {}

        for span_b in detailed_tags.find_all('span', class_='b'):
            label = span_b.get_text(strip=True).rstrip(':')
            next_span = span_b.find_next_sibling('span')

            if next_span:
                tag_list = [a.get_text(strip=True) for a in next_span.find_all('a')]
            else:
                tag_list = []

            if label == 'Назначение': label = 'occasion'
            elif label == 'Основной ингредиент': continue
            elif label == 'Блюдо': label = 'type_dish'
            elif label == 'География кухни': label = 'cuisine'

            tags[label] = tag_list

        return tags
+    except Exception as e:
+        print(e)
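As a usage illustration (not part of the diff), this is roughly what the label mapping above produces for a detailed_tags block; the HTML snippet is invented and only two of the four label branches are shown:

```python
from bs4 import BeautifulSoup as bs

# Invented markup mimicking povar.ru's detailed_tags block.
html = """
<div class="detailed_tags">
  <span class="b">Назначение:</span><span><a>Обед</a><a>Ужин</a></span>
  <span class="b">Блюдо:</span><span><a>Паста</a></span>
</div>
"""

soup = bs(html, 'html.parser')
tags = {}
for span_b in soup.find(class_='detailed_tags').find_all('span', class_='b'):
    label = span_b.get_text(strip=True).rstrip(':')
    next_span = span_b.find_next_sibling('span')
    tag_list = [a.get_text(strip=True) for a in next_span.find_all('a')] if next_span else []
    if label == 'Назначение':
        label = 'occasion'
    elif label == 'Блюдо':
        label = 'type_dish'
    tags[label] = tag_list

print(tags)  # {'occasion': ['Обед', 'Ужин'], 'type_dish': ['Паста']}
```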
@@ -79,53 +88,60 @@ def try_extr_ingredient(span_b, class_, portions=1):


def extr_ingredient(main_container):
+    try:
        # Collect the ingredients
        portions = int(main_container.find(class_='yield value').get_text(strip=True))

        tags = {}

        for span_b in main_container.find_all(class_='ingredient flex-dot-line'):

            label = try_extr_ingredient(span_b, class_='name')
            value_ingredient = try_extr_ingredient(span_b, 'value', portions)
            unit_name = try_extr_ingredient(span_b, 'u-unit-name')

            #print(label, value_ingredient, unit_name)

            tags[label] = {'unit':unit_name, 'amount':value_ingredient}

        return tags
+    except Exception as e:
+        print(e)


def extr_steps(main_container):
+    try:
        # The site has exception pages for the cooking steps: not every step has a photo.
        # On those pages we look for the class 'detailed_step_description_big noPhotoStep',
        # which exposes its text via get_text() rather than a title attribute.

        recipeInstructions = main_container.find(class_='instructions')
        steps = []
        count = 1

        main_container = recipeInstructions.find_all(class_='stepphotos')

        # Check for an exception page (no step photos)
        if not main_container:
            main_container = recipeInstructions.find_all(class_='detailed_step_description_big noPhotoStep')

        for items in main_container:
            img = items.get('href')
            title = items.get('title')

            # For 'detailed_step_description_big noPhotoStep' take the text via get_text();
            # title is empty at this point because the attribute is absent on such pages.
            if title is None:
                title = items.get_text()  # title is now filled in
-                print(title)

            steps.append({
                'img': img,
                'title': title
            })

        return steps
+    except Exception as e:
+        return None
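The comments in extr_steps describe recipe pages without step photos, where the step text lives in the element body instead of a title attribute. A small illustration of that fallback on invented markup (only the class names come from the diff):

```python
from bs4 import BeautifulSoup as bs

# Invented instructions block with no step photos, mimicking the exception pages
# the comments describe; only the class names are taken from the diff.
html = """
<div class="instructions">
  <div class="detailed_step_description_big noPhotoStep">Boil the pasta.</div>
  <div class="detailed_step_description_big noPhotoStep">Fry the bacon.</div>
</div>
"""

instructions = bs(html, 'html.parser').find(class_='instructions')

steps_nodes = instructions.find_all(class_='stepphotos')
if not steps_nodes:  # exception page: no photos, fall back to the text-only class
    steps_nodes = instructions.find_all(class_='detailed_step_description_big noPhotoStep')

steps = []
for item in steps_nodes:
    img = item.get('href')        # None on photo-less pages
    title = item.get('title')
    if title is None:             # no title attribute, take the visible text instead
        title = item.get_text(strip=True)
    steps.append({'img': img, 'title': title})

print(steps)
# [{'img': None, 'title': 'Boil the pasta.'}, {'img': None, 'title': 'Fry the bacon.'}]
```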
import_in_BD.py

@@ -1,4 +1,7 @@
-from pymongo import MongoClient
+import log_err as lg
+
+from pymongo import MongoClient, ReplaceOne

def connect_to_mongo():
    """Connect to MongoDB"""

@@ -7,11 +10,35 @@ def connect_to_mongo():
    return db["Test"]


-def import_json_in_mongo(data):
+def import_json_in_mongo(recipe_data):
+    """Saves a single recipe (kept for backwards compatibility and error handling)"""
    collection = connect_to_mongo()
-    collection.insert_one(data)
+    try:
+        collection.replace_one({"_id": recipe_data["_id"]}, recipe_data, upsert=True)
+        print(f"✅ Рецепт '{recipe_data.get('recipe_name', recipe_data['_id'])}' успешно сохранён.")
+    except Exception as e:
+        print(f"❌ Ошибка при сохранении рецепта {recipe_data.get('_id')}: {e}")
+        url = recipe_data.get('url', 'unknown')
+        lg.log_error(url, str(e), 'Buff_err.json')
+
+
+def bulk_write_recipes(recipes_list):
+    """Saves a list of recipes in bulk via bulk_write"""
+    if not recipes_list:
+        return
+
+    collection = connect_to_mongo()
+    try:
+        requests = [
+            ReplaceOne({"_id": r["_id"]}, r, upsert=True)
+            for r in recipes_list
+        ]
+        result = collection.bulk_write(requests, ordered=False)
+        print(f"✅ Bulk-запись: {len(recipes_list)} рецептов "
+              f"(upserted: {result.upserted_count}, modified: {result.modified_count})")
+    except Exception as e:
+        print(f"❌ Ошибка при bulk-записи: {e}")
+        # Optional: recipes_list could be dumped to a file here for reprocessing
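For reference, the pymongo pattern the two docstrings describe, as a standalone sketch. The connection URI, database name, and sample documents are made up; only the Test collection name and the ReplaceOne/bulk_write usage mirror the diff:

```python
from pymongo import MongoClient, ReplaceOne

# Hypothetical connection details: the diff only shows that connect_to_mongo()
# returns db["Test"], not the URI or the database name.
client = MongoClient("mongodb://localhost:27017")
collection = client["recipes_db"]["Test"]

# Invented sample documents in the shape pars_recipie() builds.
recipes = [
    {"_id": "slivochnaya_karbonara-73186", "recipe_name": "Сливочная карбонара",
     "url": "https://povar.ru/recipes/slivochnaya_karbonara-73186.html"},
    {"_id": "podjarka_k_makaronam-60879", "recipe_name": "Поджарка к макаронам",
     "url": "https://povar.ru/recipes/podjarka_k_makaronam-60879.html"},
]

# Each document is upserted by _id in a single round trip;
# ordered=False lets the batch continue past individual failures.
requests = [ReplaceOne({"_id": r["_id"]}, r, upsert=True) for r in recipes]
result = collection.bulk_write(requests, ordered=False)
print(result.upserted_count, result.modified_count)
```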
39 log_err.py (new file)

@@ -0,0 +1,39 @@
import json
import os

def log_error(url, error, filename='Err.json'):
    """
    Appends an error to the list of errors in a JSON file.

    Args:
        url (str): URL on which the error occurred.
        error (str or Exception): Description of the error.
        filename (str): Name of the log file (defaults to 'Err.json').
    """
    # Convert error to a string to avoid serialization problems
    error_str = str(error)

    # Load existing errors (if the file exists)
    if os.path.exists(filename):
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                errors = json.load(f)
        except (json.JSONDecodeError, OSError):
            errors = []
    else:
        errors = []

    # Append the new error
    errors.append({
        "url": url.strip(),
        "error": error_str
    })

    # Write back
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(errors, f, ensure_ascii=False, indent=4)
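A short usage sketch for the new helper (the URLs are invented; 'Нет шагов' is the message parser.py actually logs). The log file holds a single JSON array, so it can be read back with one json.load:

```python
import json
import log_err as lg

# Append two failures; Err.json is created on first use.
lg.log_error("https://povar.ru/recipes/some_recipe.html", "Нет шагов")
lg.log_error("https://povar.ru/recipes/other_recipe.html", ValueError("bad nutrition block"))

# The log is a plain JSON list of {"url": ..., "error": ...} objects.
with open("Err.json", encoding="utf-8") as f:
    print(json.load(f))
```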
167 parser.py

@@ -1,120 +1,101 @@
import requests
from bs4 import BeautifulSoup as bs
import re
import function as f
import json
import os
import log_err as lg

import import_in_BD as ib

link = 'https://povar.ru/list/'
+buffer = []
+
+
+def buffer_import():
+    global buffer
+    try:
+        ib.bulk_write_recipes(buffer)
+        buffer.clear()
+    except Exception as bulk_e:
+        print(f"⚠️ Bulk-ошибка: {bulk_e}. Переключаемся на поштучную запись...")
+        for recipe in buffer:
+            try:
+                ib.import_json_in_mongo(recipe)  # <- ONE recipe at a time
+            except Exception as single_e:
+                lg.log_error(recipe.get('url', 'unknown'), f"MongoDB-фейл: {single_e}")
+        buffer.clear()


def pars_group(link):
    # Collect the dish categories
    response = f.try_request(link)
    soup = bs(response.text, 'html.parser')

    main_container = soup.find_all(class_='ingredientItem')

    for items in main_container:
        item = items.find_all('a')
        title = item[0].get_text()

        if title == 'Салаты': break

        print(title)

        for i in item[1::]:
            name_group = i.get_text()
            link_group = 'https://povar.ru' + i.get('href')
            print('-'*5, name_group, link_group)

            pars_dishs(title, name_group, link_group)

        print('-'*50)


def pars_dishs(title='', name_group='', link='https://povar.ru/list/spagetti/', page=0):
    # Collect the list of recipes
    while True:
        page += 1
        new_link = link + str(page)
        soup = f.try_soup(f.try_request(new_link))

        if soup == False: break

        main_container = soup.find_all(class_='listRecipieTitle')

        for items in main_container:
            recipe_name = items.get_text()
            recipe_link = 'https://povar.ru' + items.get('href')

            print(recipe_name, recipe_link)
            pars_recipie(title, name_group, recipe_name, recipe_link)

        print('-'*50)


-def pars_recipie(title=0, name_group=0, recipe_name=0 ,link='https://povar.ru/recipes/slivochnaya_karbonara-73186.html'):
-
-    response = f.try_request(link)
-    soup = bs(response.text, 'html.parser')
-
-    main_container = soup.find(class_='cont_area hrecipe')
-
-    name_id = link.split('/')[-1]
-    try:
-        name_id = name_id.replace('.html', '')
-    except: pass
-
-    print(name_id)
-
-    photo = main_container.find(class_='photo').get('src')
-
-    recipies = {'recipes': {}}
-
-    detailed_tags = f.extract_tags_from_detailed_tags(main_container)  # collect the tags
-    #print(detailed_tags)
-
-    ingredients = f.extr_ingredient(main_container)  # collect the ingredients
-    #print(ingredients)
-
-    calories_info = f.extract_nutrition(main_container.find_all(class_='circle'))  # nutrition facts
-    #print(calories_info)
-
-    steps = f.extr_steps(main_container)  # collect the steps
-    #print(steps)
-
-    recip = {'_id' : name_id,
-             'recipe_name':recipe_name,
-             'url':link,
-             'preview_img':photo,
-             'tags':detailed_tags,
-             'ingredients':ingredients,
-             'nutritional_value':calories_info,
-             'steps':steps}
-
-    print(recip)
-    print(len(steps))


+def open_url():
+    total_type_recip = json.load(open('unique_urls.json', 'r', encoding='utf-8'))
+
+    for url in total_type_recip:
+        if len(buffer) >= 200:
+            print('❗️ Сейвим', len(buffer))
+            print(buffer)
+            buffer_import()
+
+        pars_recipie(url)
+
+    buffer_import()
+
+
+def pars_recipie(url='https://povar.ru/recipes/slivochnaya_karbonara-73186.html'):
+
+    try:
+        response = f.try_request(url)
+        soup = bs(response.text, 'html.parser')
+
+        recipe_name = soup.find(class_='detailed fn').text
+
+        main_container = soup.find(class_='cont_area hrecipe')
+
+        steps = f.extr_steps(main_container)  # collect the steps
+
+        if steps is None:
+            lg.log_error(url, 'Нет шагов')
+            return None
+
+        name_id = url.split('/')[-1].strip()
+        try:
+            name_id = name_id.replace('.html', '').strip()
+        except: pass
+
+        photo = main_container.find(class_='photo').get('src')
+
+        detailed_tags = f.extract_tags_from_detailed_tags(main_container)  # collect the tags
+        ingredients = f.extr_ingredient(main_container)  # collect the ingredients
+        calories_info = f.extract_nutrition(main_container.find_all(class_='circle'))  # nutrition facts
+
+        recip = {'_id' : name_id,
+                 'recipe_name':recipe_name,
+                 'url':url,
+                 'preview_img':photo,
+                 'tags':detailed_tags,
+                 'ingredients':ingredients,
+                 'nutritional_value':calories_info,
+                 'steps':steps}
+
+        print('⭕', recipe_name)
+        print('🤍', recip)
+
+        #ib.import_json_in_mongo(recipies)
+        buffer.append(recip)
+
+    except Exception as e:
+        print(url, e)
+        lg.log_error(url, e)


#pars_group(link)
#pars_dishs()
-pars_recipie(link="https://povar.ru/recipes/podjarka_k_makaronam-60879.html")

#pars_recipie()
+open_url()
85021 unique_urls.json (new file)

File diff suppressed because it is too large.
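open_url() iterates whatever json.load returns from unique_urls.json, so the file is presumably a flat JSON array of recipe URL strings (85021 of them in this commit). A tiny sketch of that assumed shape, using the two URLs that appear elsewhere in the diff:

```python
import json

# Assumed shape of unique_urls.json: a flat list of recipe URLs.
# These two URLs come from the diff; the real file has 85021 entries.
urls = [
    "https://povar.ru/recipes/slivochnaya_karbonara-73186.html",
    "https://povar.ru/recipes/podjarka_k_makaronam-60879.html",
]

with open('unique_urls.json', 'w', encoding='utf-8') as f:
    json.dump(urls, f, ensure_ascii=False, indent=4)

# open_url() then json.load()s this list and feeds each URL to pars_recipie().
```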