Files
Hentai_manga_parser/Serch_H.py

120 lines
2.8 KiB
Python

from bs4 import BeautifulSoup as bs
import requests
import HBD
# Base URL of the "newest" listing; get_data() appends the numeric offset.
link = 'https://x8.h-chan.me/manga/newest?offset=' #https://x8.h-chan.me/manga/
# Looked up in MongoDB at import time via the HBD helper module.
# NOTE(review): `result` is not referenced in the visible code - presumably it
# was meant to supply `max_id` below; confirm.
result = HBD.find_doc_with_max_id(HBD.connect_to_mongo())
# ID at which pars() stops collecting; compared after int() conversion.
max_id = '51196'
# NOTE(review): not referenced in the visible code; purpose unclear.
max_num = 29915
def form_date(date_str):
    """Convert a Russian "<day> <month name> <year>" date to "<day>.<MM>.<year>".

    The month must be spelled in the genitive case exactly as listed in the
    table below (for example "января"). The day and year tokens are copied
    through unchanged; only the month is translated to a two-digit number.

    Raises:
        ValueError: if ``date_str`` does not consist of exactly three
            whitespace-separated tokens.
        KeyError: if the month token is not in the table.
    """
    month_names = (
        "января",
        "февраля",
        "марта",
        "апреля",
        "мая",
        "июня",
        "июля",
        "августа",
        "сентября",
        "октября",
        "ноября",
        "декабря",
    )
    # Each name maps to its zero-padded, 1-based position in the year.
    month_numbers = {
        name: f"{position:02d}"
        for position, name in enumerate(month_names, start=1)
    }
    day, month_name, year = date_str.split()
    return ".".join((day, month_numbers[month_name], year))
def try_request(link, max_retries=50, timeout=30):
    """GET ``link``, retrying until a response with status 200 arrives.

    Args:
        link: URL to request.
        max_retries: Maximum number of attempts before giving up.
        timeout: Seconds to wait on each attempt before treating it as
            failed, so a stalled connection cannot block forever.

    Returns:
        The first response whose status code is 200, or ``None`` when every
        attempt either failed or returned a different status code.
    """
    for _ in range(max_retries):
        try:
            response = requests.get(link, timeout=timeout)
        except requests.RequestException:
            # Only network-level failures count as a failed attempt;
            # KeyboardInterrupt and programming errors must propagate.
            continue
        if response.status_code == 200:
            return response
    return None
def pars(link, flag, count):
    """Scrape one listing page and collect its entries.

    Rows are read in page order. When a row's ID equals the module-level
    ``max_id``, collection stops immediately: that row is not added and the
    progress output for the page is skipped.

    Args:
        link: URL of the listing page to fetch.
        flag: Stop marker from the caller; returned unchanged unless the
            ``max_id`` row is reached, in which case ``True`` is returned.
        count: Page number, used only in the progress output.

    Returns:
        A ``(data_hantai, flag)`` tuple. ``data_hantai`` maps each title to
        a dict with the keys ``img``, ``link``, ``tags``, ``date``,
        ``manga_link`` and ``original_id``. ``img`` is ``None`` when the row
        has no cover image; in that case ``manga_link`` carries no
        ``cacheId`` query parameter.

    Raises:
        ConnectionError: if the page could not be fetched.
    """
    response = try_request(link)
    if response is None:
        # try_request signals exhaustion with None; fail with a clear
        # message instead of an AttributeError on ``response.text``.
        raise ConnectionError(f'could not fetch {link}')

    data_hantai = {}
    soup = bs(response.text, 'html.parser')
    for item in soup.find_all(class_='content_row'):
        # A row may lack the image container, the <img> tag or its src.
        images = item.find(class_='manga_images')
        img_tag = images.find('img') if images is not None else None
        img = img_tag.get('src') if img_tag is not None else None

        # The cache id is the first 10 characters of the second-to-last
        # path segment of the cover URL.
        segments = img.split('/') if img else []
        cache_id = segments[-2][:10] if len(segments) >= 2 else None

        row_container = item.find(class_='title_link')
        # NOTE(review): entry links use a different host than the listing
        # URL passed in by get_data - confirm this is intended.
        link_manga = 'https://hentaichan.live' + row_container.get('href')
        title = row_container.text
        tags = [tag.strip() for tag in item.find(class_='genre').text.split(',')]
        date = form_date(item.find(class_='row4_right').find('b').text)

        manga_link = link_manga
        if cache_id is not None:
            manga_link = manga_link + '?cacheId=' + cache_id
        manga_link = manga_link.replace('/manga/', '/online/')

        # The ID is the leading "-"-separated token of the last URL segment.
        original_id = link_manga.split('/')[-1].split('-')[0]
        if int(original_id) == int(max_id):
            return data_hantai, True

        data_hantai[title] = {
            'img': img,
            'link': link_manga,
            'tags': tags,
            'date': date,
            'manga_link': manga_link,
            'original_id': original_id,
        }

    print(f'стр - {count}', data_hantai)
    for entry_title, entry in data_hantai.items():
        print('-'*10, entry_title, entry['manga_link'])
    return data_hantai, flag
def get_data():
    """Collect entries from successive listing pages.

    Pages are requested at offsets 0, 20, ..., 4980 appended to the
    module-level ``link``. Scanning stops as soon as ``pars`` reports the
    stop marker, or after the last offset.

    Returns:
        A dict merging the per-page results of ``pars``. It is returned even
        when no page reported the stop marker.
    """
    data = {}
    for count, offset in enumerate(range(0, 5000, 20), start=1):
        data_hantai, flag = pars(link + str(offset), False, count)
        data.update(data_hantai)
        if flag:
            break
    # Also reached when no page hit the stop marker: hand back what was
    # collected rather than falling off the end and returning None.
    return data