120 lines
2.8 KiB
Python
120 lines
2.8 KiB
Python
from bs4 import BeautifulSoup as bs
|
|
import requests
|
|
import HBD
|
|
|
|
# Base URL of the "newest" listing; get_data() appends the numeric page offset.
# NOTE(review): pars() builds item links on a different host
# ('https://hentaichan.live') - confirm that the two hosts are meant to differ.
link = 'https://x8.h-chan.me/manga/newest?offset='  # https://x8.h-chan.me/manga/

# Queried once at import time. Presumably the stored document with the highest
# id - TODO confirm against the HBD module. Not read anywhere in the code
# visible here.
result = HBD.find_doc_with_max_id(HBD.connect_to_mongo())

# Site id at which pars() stops scanning (compared numerically via int()).
max_id = '51196'

# NOTE(review): not referenced by the code visible here; meaning unconfirmed.
max_num = 29915
|
|
|
|
|
|
def form_date(date_str):
    """Convert a Russian long-form date into ``day.MM.year``.

    Args:
        date_str: Three whitespace-separated fields: the day, the month name
            in the genitive case (for example ``"марта"``) and the year.

    Returns:
        ``f"{day}.{month}.{year}"`` where ``month`` is the two-digit month
        number. ``day`` and ``year`` are passed through exactly as given
        (the day is not zero-padded).

    Raises:
        ValueError: If ``date_str`` does not split into exactly three fields.
        KeyError: If the month name is not one of the twelve known names.
    """
    months = {
        "января": "01",
        "февраля": "02",
        "марта": "03",
        "апреля": "04",
        "мая": "05",
        "июня": "06",
        "июля": "07",
        "августа": "08",
        "сентября": "09",
        "октября": "10",
        "ноября": "11",
        "декабря": "12",
    }

    day, month_str, year = date_str.split()

    # Case-insensitive lookup so a capitalised month name still resolves.
    month = months[month_str.lower()]

    return f"{day}.{month}.{year}"
|
|
|
|
def try_request(link, max_retries=50, timeout=30):
    """GET ``link``, retrying until a 200 response arrives or attempts run out.

    Args:
        link: URL to fetch.
        max_retries: Maximum number of attempts before giving up.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection counts as a failed attempt instead of blocking forever.

    Returns:
        The first ``requests.Response`` whose status code is 200, or ``None``
        if every attempt failed or returned another status.
    """
    for _ in range(max_retries):
        try:
            response = requests.get(link, timeout=timeout)
        except requests.RequestException:
            # Only network/HTTP-level failures are retried; KeyboardInterrupt
            # and programming errors propagate to the caller.
            continue

        if response.status_code == 200:
            return response

    return None
|
|
|
|
def pars(link, flag, count):
    """Scrape one listing page into a dict of entries keyed by title.

    Rows are read in page order. Parsing stops as soon as a row whose numeric
    id equals the module-level ``max_id`` is reached; that row is not included
    in the result.

    Args:
        link: URL of the listing page to fetch.
        flag: Incoming stop flag; returned unchanged when the stop id is not
            found on this page.
        count: Page number, used only in the progress print.

    Returns:
        ``(data_hantai, flag)``. ``data_hantai`` maps each title to a dict
        with the keys ``img``, ``link``, ``tags``, ``date``, ``manga_link``
        and ``original_id``. ``flag`` is ``True`` when the stop id was hit.
        Two rows with the same title overwrite each other.

    Raises:
        RuntimeError: If the page could not be fetched.
    """
    data_hantai = {}

    response = try_request(link)
    if response is None:
        # try_request yields None once its retries are exhausted; fail with a
        # clear message instead of an AttributeError on ``response.text``.
        raise RuntimeError(f'failed to fetch {link}: no 200 response')

    soup = bs(response.text, 'html.parser')

    for item in soup.find_all(class_='content_row'):
        row_container = item.find(class_='title_link')
        link_manga = 'https://hentaichan.live' + row_container.get('href')

        # The last URL segment starts with the numeric id, e.g. ".../12345-name".
        ID = link_manga.split('/')[-1].split('-')[0]

        # Check the stop marker first so nothing else is parsed for that row.
        # NOTE(review): only exact equality stops the scan; if the entry with
        # max_id is absent from the listing the scan never stops here.
        if int(ID) == int(max_id):
            return data_hantai, True

        # The thumbnail is optional: keep ``img`` as None when it is missing.
        images = item.find(class_='manga_images')
        img_tag = images.find('img') if images is not None else None
        img = img_tag['src'] if img_tag is not None else None

        title = row_container.text

        tags = item.find(class_='genre')
        tags = [tag.strip() for tag in tags.text.split(',')]

        date = item.find(class_='row4_right').find('b').text
        date = form_date(date)

        manga_link = link_manga
        if img is not None:
            # First 10 characters of the second-to-last path segment of the
            # thumbnail URL are sent as the ``cacheId`` query parameter.
            cache_id = img.split('/')[-2][:10]
            manga_link += '?cacheId=' + cache_id
        # NOTE(review): without a thumbnail the reader link carries no
        # cacheId - confirm the site accepts that.
        manga_link = manga_link.replace('/manga/', '/online/')

        data_hantai[title] = {
            'img': img,
            'link': link_manga,
            'tags': tags,
            'date': date,
            'manga_link': manga_link,
            'original_id': ID,
        }

    print(f'стр - {count}', data_hantai)

    for i, j in data_hantai.items():
        print('-'*10, i, j['manga_link'])

    return data_hantai, flag
|
|
|
|
def get_data():
    """Collect new entries from the listing pages, newest first.

    Pages are requested at offsets 0, 20, 40, ... below 5000 by appending the
    offset to the module-level ``link``. Scanning stops early when ``pars``
    reports that the stop id was reached.

    Returns:
        A dict merging the per-page results of ``pars``. It is returned even
        when the stop id is never encountered, so collected data is not lost.
    """
    data = {}

    for count, offset in enumerate(range(0, 5000, 20), start=1):
        data_hantai, flag = pars(link + str(offset), False, count)
        data.update(data_hantai)

        if flag:
            break

    return data
|
|
|
|
|