Финальная сборка.

Собраны все ссылки, удалены дубликаты. Осталось 85 тыс. ссылок. Парсим рецепты.
2025-11-24 03:24:08 +03:00
parent 8ff6022d8c
commit 12c4dcf33b
5 changed files with 85221 additions and 191 deletions
--- a/parser.py
+++ b/parser.py
@@ -3,166 +3,99 @@ from bs4 import BeautifulSoup as bs
 import function as f
 import json
 import os
+import log_err as lg

 import import_in_BD as ib

 link = 'https://povar.ru/list/'
+buffer = []

+def buffer_import():
+    global buffer

-
-
-
-total_type_recip = {}
-
-def save_to_json(new_data, filename='total_type_recip.json'):
-    # Загружаем существующие данные, если файл существует
-    if os.path.exists(filename):
-        with open(filename, 'r', encoding='utf-8') as f:
-            try:
-                existing_data = json.load(f)
-            except json.JSONDecodeError:
-                existing_data = {}
-    else:
-        existing_data = {}
-
-    # Сливаем new_data в existing_data
-    for category, groups in new_data.items():
-        if category not in existing_data:
-            existing_data[category] = {}
-        for group, recipes in groups.items():
-            # Перезаписываем только если ещё не было или чтобы не дублировать — можно использовать set позже
-            existing_data[category][group] = recipes
-
-    # Сохраняем обратно
-    with open(filename, 'w', encoding='utf-8') as f:
-        json.dump(existing_data, f, ensure_ascii=False, indent=4)
-
-
-
-def pars_group(link):
-    #Сбор видов блюд
-
-
-    response = f.try_request(link)
-    soup = bs(response.text, 'html.parser')
-
-    main_container = soup.find_all(class_='ingredientItem')
-
-    for items in main_container:
-
-        item = items.find_all('a')
-
-        title = items.find_all('h2')
-        title = title[0].get_text()
-
-        #if title == 'Выпечка': break
-
-        # Инициализируем категорию, если ещё не создана
-        if title not in total_type_recip:
-            total_type_recip[title] = {}
-
-        print(title)
-
-        for i in item[1::]:
-            name_group = i.get_text()
-            link_group = 'https://povar.ru' + i.get('href')
-            print('-'*5, name_group, link_group)
-
-            total_type_recip[title][name_group] = []
-
-            pars_dishs(title, name_group, link_group)
-
-
-        print('-'*50)
-
-
-
-
-def pars_dishs(title='', name_group='', link='https://povar.ru/list/spagetti/', page=0):
-
-
-    global total_type_recip
-
-    #Сбор списка рецептов
-    recipes = []
-
-    while True:
-
-        page += 1
-        new_link = link + str(page)
-        soup = f.try_soup(f.try_request(new_link))
-
-        if soup == False: break
-
-
-        main_container = soup.find_all(class_='listRecipieTitle')
-
-        for items in main_container:
-            recipe_name = items.get_text()
-            recipe_link = 'https://povar.ru' + items.get('href')
-
-            print('-'*10,recipe_name, recipe_link)
-
-            #pars_recipie(title, name_group, recipe_name, recipe_link)
-
-            recipes.append({'name': recipe_name, 'url': recipe_link})
-
-
-
-        print('-'*50)
-
-    # После сбора всех страниц — записываем в глобальную структуру
-    total_type_recip[title][name_group] = recipes
-
-    # И сразу сохраняем ВЕСЬ словарь в JSON
-    save_to_json(total_type_recip)
-
-
-
-def pars_recipie(title=0, name_group=0, recipe_name=0 ,link='https://povar.ru/recipes/slivochnaya_karbonara-73186.html'):
-
-    response = f.try_request(link)
-    soup = bs(response.text, 'html.parser')
-
-    main_container = soup.find(class_='cont_area hrecipe')
-
-    name_id = link.split('/')[-1]
    try:
-        name_id = name_id.replace('.html', '')
-    except: pass
-
-    print(name_id)
-
-    photo = main_container.find(class_='photo').get('src')
-
-    recipies = {'recipes': {}}
-
-    detailed_tags = f.extract_tags_from_detailed_tags(main_container) #Собираем теги
-    ingredients = f.extr_ingredient(main_container) #Собираем ингредиенты
-    calories_info = f.extract_nutrition(main_container.find_all(class_='circle')) #БЖУ
-    steps = f.extr_steps(main_container) #Сборка шагов
+        ib.bulk_write_recipes(buffer)
+        buffer.clear()
+    except Exception as bulk_e:
+        print(f"⚠️ Bulk-ошибка: {bulk_e}. Переключаемся на поштучную запись...")
+        for recipe in buffer:
+            try:
+                ib.import_json_in_mongo(recipe)  # ← ОДИН рецепт
+            except Exception as single_e:
+                lg.log_error(recipe.get('url', 'unknown'), f"MongoDB-фейл: {single_e}")
+        buffer.clear()


-    recip = {'_id' : name_id,
+
+
+
+def open_url():
+    total_type_recip = json.load(open('unique_urls.json', 'r', encoding='utf-8'))
+
+    for url in total_type_recip:
+        if len(buffer) >= 200:
+            print('❗️ Сейвим', len(buffer))
+            print(buffer)
+            buffer_import()
+
+        pars_recipie(url)
+
+    buffer_import()
+
+
+def pars_recipie(url='https://povar.ru/recipes/slivochnaya_karbonara-73186.html'):
+
+    try:
+
+        response = f.try_request(url)
+        soup = bs(response.text, 'html.parser')
+
+        recipe_name = soup.find(class_='detailed fn').text
+
+        main_container = soup.find(class_='cont_area hrecipe')
+
+
+        steps = f.extr_steps(main_container) #Сборка шагов
+
+        if steps is None:
+            lg.log_error(url, 'Нет шагов')
+            return None
+
+        name_id = url.split('/')[-1].strip()
+        try:
+            name_id = name_id.replace('.html', '').strip()
+        except: pass
+
+        photo = main_container.find(class_='photo').get('src')
+
+
+        detailed_tags = f.extract_tags_from_detailed_tags(main_container) #Собираем теги
+        ingredients = f.extr_ingredient(main_container) #Собираем ингредиенты
+        calories_info = f.extract_nutrition(main_container.find_all(class_='circle')) #БЖУ
+
+
+        recip = {'_id' : name_id,
                'recipe_name':recipe_name,
-                'url':link,
+                'url':url,
                'preview_img':photo,
                'tags':detailed_tags,
                'ingredients':ingredients,
                'nutritional_value':calories_info,
                'steps':steps}

-    print('Шагов - ',len(steps))
+        print('⭕',recipe_name)
+        print('🤍',recip)

-    #ib.import_json_in_mongo(recip)
+        buffer.append(recip)


+    except Exception as e:
+        print(url, e)
+        lg.log_error(url, e)
+

-pars_group(link)
-#pars_dishs()
 #pars_recipie()
-
-
+open_url()