rosi 的数据包是通过手机端的 HttpCanary 抓到的。本来想用 Charles、Fiddler,但一直找不到目标请求,不太好用;改用手机端直接抓包后,很快就定位到了数据包。

接口:http://rosi.jinyemimi.com/api_beta1_3/api.php,post请求,需携带请求头和参数,参数中包含有页码信息。
import requests,asyncio,aiohttp
import csv,json,os
from asyncio.queues import Queue
# Module-level work queue shared by main() (producer: fills it from
# rosi.json) and the down_and_parse_task workers (consumers).
task_queue=Queue()
def get_all():
    """Yield one metadata dict per archive entry from the paginated API.

    Posts to the archive endpoint for pages 1..249 and yields, per entry:
    ``end_page`` (total page count reported by the API), ``title``,
    ``home_img`` (cover image URL), ``shorttitle`` (URL path component)
    and ``source`` — the image count as a string, or None when the API
    reports '未知' (unknown).
    """
    url = 'http://rosi.jinyemimi.com/api_beta1_3/api.php'
    headers = {"Accept": "application/json", "urlname": "test", "Content-Length": "127",
               "Host": "rosi.jinyemimi.com", "Connection": "Keep-Alive", "Accept-Encoding": "gzip",
               "User-Agent": "okhttp/3.8.1", }
    for page in range(1, 250):
        data = {"action": "getArchiveState", "controller": "archives", "m_mid": 670704,
                "page": page, "tid": 0,
                "token": "3b7b35bf6640ce456babff0c6d434f73"}
        response = requests.post(url, headers=headers, data=data).json()
        for entry in response['entries_s']:
            # Build a FRESH dict per entry. The original reused a single
            # dict across yields, so every yielded item aliased the same
            # mutating object — list(get_all()) would have collapsed to
            # copies of the last entry.
            yield {
                'end_page': response['total_page'],
                'title': entry['title'],
                'home_img': entry['litpic'],
                'shorttitle': entry['shorttitle'],
                # '未知' marks an unknown image count; normalize to None.
                'source': None if entry['source'] == '未知' else entry['source'],
            }
def get_img(item):
    """Build the full set of download URLs for one archive entry.

    Args:
        item: dict with keys ``source`` (image count as a numeric string,
            or None when unknown), ``shorttitle`` (URL path component)
            and ``home_img`` (cover image URL).

    Returns:
        ``{'home': cover_url, 'img_url': [url, ...]}`` — or an empty dict
        when ``item['source']`` is falsy (image count unknown), which
        callers treat as "skip this entry".
    """
    all_url = {}
    if item['source']:
        all_url['home'] = item['home_img']
        # Image names are zero-padded to three digits (001.rosi ... 099.rosi,
        # 100.rosi ...); zfill(3) reproduces the original's manual padding
        # branches exactly.
        all_url['img_url'] = [
            'http://rs.jinyemimi.com/jpg/{0}/{1}.rosi'.format(
                item['shorttitle'], str(n).zfill(3))
            for n in range(1, int(item['source']) + 1)
        ]
    return all_url
def img_list():
    """Yield the non-empty URL bundle for every archive entry.

    Entries whose image count is unknown produce an empty dict from
    get_img() and are silently skipped.
    """
    for entry in get_all():
        bundle = get_img(entry)
        if bundle:
            yield bundle
def save_csv():
    """Append every URL bundle from img_list() to rosi.json, one JSON
    object per line (JSON Lines format).

    NOTE: the name is historical — despite "csv" it writes JSON.
    """
    # Open the output file once, instead of reopening it for every
    # single record as the original did.
    with open('rosi.json', 'a', encoding='utf-8') as f:
        for bundle in img_list():
            print('正在写入json文件:%s' % bundle)
            f.write(json.dumps(bundle) + '\n')
# Stage 1 entry point: scrape all image URLs and persist them to rosi.json.
save_csv()
这里就是先将所有的图片URL存储为json文件,然后使用异步读取下载:
async def down_and_parse_task(queue):
    """Worker coroutine: drain `queue` and download every image bundle.

    Each queue item is a dict produced by get_img():
    ``{'home': cover_url, 'img_url': [url, ...]}``.

    Exits when the queue is empty. The original's bare ``except: pass``
    both busy-spun forever at 100% CPU once the queue drained and
    silently swallowed every download error.
    """
    while True:
        try:
            item = queue.get_nowait()
        except asyncio.QueueEmpty:
            # Queue drained — this worker is done.
            return
        try:
            home_url = item['home']
            for img_url in item['img_url']:
                await file_path(home_url, img_url)
        except Exception as e:
            # Best effort: report the failing bundle, keep the worker alive.
            print('下载出错: {0}'.format(e))
async def file_path(home_url, img_url):
    """Derive the local target path for `img_url` and download it.

    The filename joins the last two URL path segments,
    e.g. .../jpg/abc/001.rosi -> 'F:/图片/abc-001.jpg'.

    `home_url` is unused but kept for caller compatibility — the original
    computed a `file_name` from it that was never read (dead code, removed).
    """
    parts = img_url.split('/')
    img_name = parts[-2] + '-' + parts[-1].split('.')[0]
    target = 'F:/图片/{0}.jpg'.format(img_name)
    await down_img(img_url, target)
async def down_img(url, file_path):
    """Download `url` into `file_path`, skipping files that already exist."""
    # Check for an existing file BEFORE making the request — the original
    # checked afterwards, downloading the full image only to discard it.
    if os.path.exists(file_path):
        return
    print('正在下载图片:{0}'.format(url))
    async with aiohttp.ClientSession() as session:
        # session.get(...) is itself an async context manager; the
        # original's extra `await` in front of it was redundant.
        async with session.get(url=url) as res:
            page_content = await res.read()
    with open(file_path, 'wb') as f:
        f.write(page_content)
async def main():
    """Load every URL bundle from rosi.json into the task queue, then
    spawn 20 download workers on the module-level event loop.

    Workers are started even when the file is missing or malformed; they
    exit immediately on an empty queue.
    """
    try:
        with open('rosi.json', 'r', encoding='utf-8') as f:
            # Stream line by line — no need to materialize readlines().
            for line in f:
                await task_queue.put(json.loads(line))
    except (OSError, json.JSONDecodeError) as e:
        # Narrowed from the original bare `except: pass`, which hid even
        # a missing/corrupt rosi.json without a trace.
        print('读取 rosi.json 失败: {0}'.format(e))
    for _ in range(20):
        loop.create_task(down_and_parse_task(task_queue))
# Entry point: create the event loop, schedule main() (which fills the
# queue and spawns workers), and keep the loop alive so those worker
# tasks can run. `loop` must be module-level: main() references it.
if __name__=='__main__':
    loop=asyncio.get_event_loop()
    loop.create_task(main())
    loop.run_forever()  # keep running (original comment: 一直运行)
仅作整理
声明:1. 本站所有资源来源于用户上传和网络,因此不包含技术服务请大家谅解!如有侵权请邮件联系客服!
2. 本站不保证所提供下载的资源的准确性、安全性和完整性,资源仅供下载学习之用!如有链接无法下载、失效或广告,请联系客服处理!
3. 您必须在下载后的24个小时之内,从您的电脑中彻底删除上述内容资源!如用于商业或者非法用途,与本站无关,一切后果请用户自负!