Python 爬取影视网站下载链接

 更新时间:2021年05月28日 10:58:51   作者:GriffinLewis2001  
一个简单的爬取影视网站下载链接的爬虫,非常适合新手学习,感兴趣的朋友可以参考下

项目地址:

https://github.com/GriffinLewis2001/Python_movie_links_scraper

运行效果

导入模块

import requests,re
from requests.cookies import RequestsCookieJar
from fake_useragent import UserAgent
import os,pickle,threading,time
import concurrent.futures
from goto import with_goto

爬虫主代码

def get_content_url_name(url):
    """Fetch *url* and extract ``(href, title)`` pairs of thumbnail links.

    Args:
        url: Address of the page to download.

    Returns:
        A list of ``(href, title)`` tuples, one for every anchor whose
        markup matches the ``class="thumbnail-img"`` pattern; empty when
        nothing matches.

    Raises:
        requests.RequestException: If the HTTP request itself fails.
    """
    send_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
    }
    cookie_jar = RequestsCookieJar()
    cookie_jar.set("mttp", "9740fe449238", domain="www.yikedy.co")
    # The headers must be passed by keyword: the second positional parameter
    # of requests.get() is ``params``, which would append these values to the
    # query string instead of sending them as HTTP headers.
    response = requests.get(url, headers=send_headers, cookies=cookie_jar)
    response.encoding = 'utf-8'
    content = response.text
    # NOTE(review): the pattern as published demanded six consecutive
    # rel="external nofollow" attributes, which looks like a publishing
    # artifact. Zero or more are accepted here; confirm against live markup.
    reg = re.compile(
        r'<a href="(.*?)"(?:\s+rel="external nofollow")*\s+'
        r'class="thumbnail-img" title="(.*?)"'
    )
    return reg.findall(content)

def get_content(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to download.

    Returns:
        The response text, decoded with UTF-8 regardless of what the server
        declares.

    Raises:
        requests.RequestException: If the HTTP request itself fails.
    """
    send_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
    }
    cookie_jar = RequestsCookieJar()
    cookie_jar.set("mttp", "9740fe449238", domain="www.yikedy.co")
    # The headers must be passed by keyword: the second positional parameter
    # of requests.get() is ``params``, which would append these values to the
    # query string instead of sending them as HTTP headers.
    response = requests.get(url, headers=send_headers, cookies=cookie_jar)
    response.encoding = 'utf-8'
    return response.text



def search_durl(url):
    """Collect the download links offered for a detail page.

    The detail page is fetched and searched for a token stored under a
    hex-escaped ``decriptParam`` key. That token is appended to the page's
    ``/downloadList`` endpoint, whose response is scanned for link hrefs.

    Args:
        url: Detail-page address. Its last five characters are dropped to
            build the endpoint base (presumably a ``.html`` suffix; confirm
            against the site).

    Returns:
        A list of href strings taken from the download list; empty when the
        token is missing from the detail page or no link matches.
    """
    content = get_content(url)
    # The key appears in the page source as literal \xNN escapes; the
    # sequence 64 65 63 72 69 70 74 50 61 72 61 6d spells "decriptParam".
    token_reg = re.compile(r"{'\\x64\\x65\\x63\\x72\\x69\\x70\\x74\\x50\\x61\\x72\\x61\\x6d':'(.*?)'}")
    token = token_reg.search(content)
    if token is None:
        # No token on the page: report "no links" rather than an IndexError.
        return []
    download_url = url[:-5] + r'/downloadList?decriptParam=' + token.group(1)
    content = get_content(download_url)
    # NOTE(review): the pattern as published required six consecutive
    # rel="external nofollow" attributes after the href, which looks like a
    # publishing artifact. The href is captured up to its closing quote.
    link_reg = re.compile(r'title=".*?" href="(.*?)"')
    return link_reg.findall(content)
def get_page(url):
    """Fetch a search-results page and extract its title links.

    Args:
        url: Address of the search-results page.

    Returns:
        A list of ``(href, title, text)`` tuples, one for every anchor that
        matches the ``class="title"`` pattern: the link target, the value of
        its ``title`` attribute and the anchor's inner text. Empty when
        nothing matches.

    Raises:
        requests.RequestException: If the HTTP request itself fails.
    """
    send_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
    }
    cookie_jar = RequestsCookieJar()
    cookie_jar.set("mttp", "9740fe449238", domain="www.yikedy.co")
    # The headers must be passed by keyword: the second positional parameter
    # of requests.get() is ``params``, which would append these values to the
    # query string instead of sending them as HTTP headers.
    response = requests.get(url, headers=send_headers, cookies=cookie_jar)
    response.encoding = 'utf-8'
    content = response.text
    # NOTE(review): the pattern as published demanded six consecutive
    # rel="external nofollow" attributes, which looks like a publishing
    # artifact. Zero or more are accepted here; confirm against live markup.
    reg = re.compile(
        r'<a target="_blank" class="title" href="(.*?)"'
        r'(?:\s+rel="external nofollow")*\s+title="(.*?)">(.*?)<\/a>'
    )
    return reg.findall(content)
def main():
    """Run one interactive search and print the chosen title's download links.

    Reads a title from standard input, queries the site's search page via
    ``get_page``, lists the results whose title contains the input, asks for
    the index of one of them and prints every link ``search_durl`` returns
    for it. Entering ``quit`` as the title terminates the process.

    Raises:
        SystemExit: When the user enters ``quit``.
    """
    from urllib.parse import quote

    print("=========================================================")
    name = input("请输入剧名(输入quit退出):")
    if name == "quit":
        # Same effect as exit(), without relying on the ``site`` module
        # having injected that helper.
        raise SystemExit
    # Percent-encode the term so characters such as '&', '#' or spaces
    # cannot truncate or alter the search query.
    url = "http://www.yikedy.co/search?query=" + quote(name, safe="")
    dlist = get_page(url)
    print("\n")

    # Only results whose title contains the search term are offered.
    # Early returns cover every "nothing to show" case, so no goto/label
    # bytecode rewriting is needed.
    matches = [entry for entry in dlist if name in entry[1]]
    if not matches:
        print("没找到或不想看\n")
        return
    for num, entry in enumerate(matches):
        print(f"{num} {entry[1]}")

    try:
        dest = int(input("\n\n请输入剧的编号(输100跳过此次搜寻):"))
    except ValueError:
        dest = 100  # non-numeric input is treated like the "skip" code
    # 100 is the documented skip code; an index outside the listed range is
    # handled the same way instead of printing an empty link section.
    if dest == 100 or not 0 <= dest < len(matches):
        print("没找到或不想看\n")
        return

    print("\n以下为下载链接:\n")
    for durl in search_durl(matches[dest][0]):
        print(f"{durl}\n")
    print("\n")

完整代码

import requests,re
from requests.cookies import RequestsCookieJar
from fake_useragent import UserAgent
import os,pickle,threading,time
import concurrent.futures
from goto import with_goto

def get_content_url_name(url):
    """Fetch *url* and extract ``(href, title)`` pairs of thumbnail links.

    Args:
        url: Address of the page to download.

    Returns:
        A list of ``(href, title)`` tuples, one for every anchor whose
        markup matches the ``class="thumbnail-img"`` pattern; empty when
        nothing matches.

    Raises:
        requests.RequestException: If the HTTP request itself fails.
    """
    send_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
    }
    cookie_jar = RequestsCookieJar()
    cookie_jar.set("mttp", "9740fe449238", domain="www.yikedy.co")
    # The headers must be passed by keyword: the second positional parameter
    # of requests.get() is ``params``, which would append these values to the
    # query string instead of sending them as HTTP headers.
    response = requests.get(url, headers=send_headers, cookies=cookie_jar)
    response.encoding = 'utf-8'
    content = response.text
    # NOTE(review): the pattern as published demanded six consecutive
    # rel="external nofollow" attributes, which looks like a publishing
    # artifact. Zero or more are accepted here; confirm against live markup.
    reg = re.compile(
        r'<a href="(.*?)"(?:\s+rel="external nofollow")*\s+'
        r'class="thumbnail-img" title="(.*?)"'
    )
    return reg.findall(content)

def get_content(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to download.

    Returns:
        The response text, decoded with UTF-8 regardless of what the server
        declares.

    Raises:
        requests.RequestException: If the HTTP request itself fails.
    """
    send_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
    }
    cookie_jar = RequestsCookieJar()
    cookie_jar.set("mttp", "9740fe449238", domain="www.yikedy.co")
    # The headers must be passed by keyword: the second positional parameter
    # of requests.get() is ``params``, which would append these values to the
    # query string instead of sending them as HTTP headers.
    response = requests.get(url, headers=send_headers, cookies=cookie_jar)
    response.encoding = 'utf-8'
    return response.text



def search_durl(url):
    """Collect the download links offered for a detail page.

    The detail page is fetched and searched for a token stored under a
    hex-escaped ``decriptParam`` key. That token is appended to the page's
    ``/downloadList`` endpoint, whose response is scanned for link hrefs.

    Args:
        url: Detail-page address. Its last five characters are dropped to
            build the endpoint base (presumably a ``.html`` suffix; confirm
            against the site).

    Returns:
        A list of href strings taken from the download list; empty when the
        token is missing from the detail page or no link matches.
    """
    content = get_content(url)
    # The key appears in the page source as literal \xNN escapes; the
    # sequence 64 65 63 72 69 70 74 50 61 72 61 6d spells "decriptParam".
    token_reg = re.compile(r"{'\\x64\\x65\\x63\\x72\\x69\\x70\\x74\\x50\\x61\\x72\\x61\\x6d':'(.*?)'}")
    token = token_reg.search(content)
    if token is None:
        # No token on the page: report "no links" rather than an IndexError.
        return []
    download_url = url[:-5] + r'/downloadList?decriptParam=' + token.group(1)
    content = get_content(download_url)
    # NOTE(review): the pattern as published required six consecutive
    # rel="external nofollow" attributes after the href, which looks like a
    # publishing artifact. The href is captured up to its closing quote.
    link_reg = re.compile(r'title=".*?" href="(.*?)"')
    return link_reg.findall(content)
def get_page(url):
    """Fetch a search-results page and extract its title links.

    Args:
        url: Address of the search-results page.

    Returns:
        A list of ``(href, title, text)`` tuples, one for every anchor that
        matches the ``class="title"`` pattern: the link target, the value of
        its ``title`` attribute and the anchor's inner text. Empty when
        nothing matches.

    Raises:
        requests.RequestException: If the HTTP request itself fails.
    """
    send_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
    }
    cookie_jar = RequestsCookieJar()
    cookie_jar.set("mttp", "9740fe449238", domain="www.yikedy.co")
    # The headers must be passed by keyword: the second positional parameter
    # of requests.get() is ``params``, which would append these values to the
    # query string instead of sending them as HTTP headers.
    response = requests.get(url, headers=send_headers, cookies=cookie_jar)
    response.encoding = 'utf-8'
    content = response.text
    # NOTE(review): the pattern as published demanded six consecutive
    # rel="external nofollow" attributes, which looks like a publishing
    # artifact. Zero or more are accepted here; confirm against live markup.
    reg = re.compile(
        r'<a target="_blank" class="title" href="(.*?)"'
        r'(?:\s+rel="external nofollow")*\s+title="(.*?)">(.*?)<\/a>'
    )
    return reg.findall(content)
def main():
    """Run one interactive search and print the chosen title's download links.

    Reads a title from standard input, queries the site's search page via
    ``get_page``, lists the results whose title contains the input, asks for
    the index of one of them and prints every link ``search_durl`` returns
    for it. Entering ``quit`` as the title terminates the process.

    Raises:
        SystemExit: When the user enters ``quit``.
    """
    from urllib.parse import quote

    print("=========================================================")
    name = input("请输入剧名(输入quit退出):")
    if name == "quit":
        # Same effect as exit(), without relying on the ``site`` module
        # having injected that helper.
        raise SystemExit
    # Percent-encode the term so characters such as '&', '#' or spaces
    # cannot truncate or alter the search query.
    url = "http://www.yikedy.co/search?query=" + quote(name, safe="")
    dlist = get_page(url)
    print("\n")

    # Only results whose title contains the search term are offered.
    # Early returns cover every "nothing to show" case, so no goto/label
    # bytecode rewriting is needed.
    matches = [entry for entry in dlist if name in entry[1]]
    if not matches:
        print("没找到或不想看\n")
        return
    for num, entry in enumerate(matches):
        print(f"{num} {entry[1]}")

    try:
        dest = int(input("\n\n请输入剧的编号(输100跳过此次搜寻):"))
    except ValueError:
        dest = 100  # non-numeric input is treated like the "skip" code
    # 100 is the documented skip code; an index outside the listed range is
    # handled the same way instead of printing an empty link section.
    if dest == 100 or not 0 <= dest < len(matches):
        print("没找到或不想看\n")
        return

    print("\n以下为下载链接:\n")
    for durl in search_durl(matches[dest][0]):
        print(f"{durl}\n")
    print("\n")

# Show the banner once, then keep running search sessions; the loop only
# ends when main() terminates the process (the user types "quit").
print("本软件由CLY.所有\n\n")
while True:
    main()

以上就是 Python 爬取影视网站下载链接的详细内容,更多关于 Python 爬取下载链接的资料请关注脚本之家其它相关文章!

相关文章

  • 浅谈Python中的闭包

    浅谈Python中的闭包

    简单说,闭包就是根据不同的配置信息得到不同的结果。再来看看专业的解释:闭包(Closure)是词法闭包(Lexical Closure)的简称,是引用了自由变量的函数。这个被引用的自由变量将和这个函数一同存在,即使已经离开了创造它的环境也不例外。
    2015-07-07
  • 简单利用conda安装tensorflow-gpu=2.2.0的过程及问题解决

    简单利用conda安装tensorflow-gpu=2.2.0的过程及问题解决

    这篇文章主要介绍了简单利用conda安装tensorflow-gpu=2.2.0,本文给大家详细分享问题记录及错误问题解决方案,需要的朋友可以参考下
    2023-01-01
  • Python中创建数值列表的4种方法总结

    Python中创建数值列表的4种方法总结

    在Python中列表(List)是一种有序、可变的数据类型,被广泛用于存储和处理多个元素,这篇文章主要给大家介绍了关于Python中创建数值列表的4种方法,需要的朋友可以参考下
    2024-05-05
  • python json load json 数据后出现乱序的解决方案

    python json load json 数据后出现乱序的解决方案

    今天小编就为大家分享一篇python json load json 数据后出现乱序的解决方案,具有很好的参考价值,希望对大家有所帮助。一起跟随小编过来看看吧
    2020-02-02
  • Python程序运行原理图文解析

    Python程序运行原理图文解析

    这篇文章主要介绍了Python程序运行原理图文解析,分享了相关代码示例,小编觉得还是挺不错的,具有一定借鉴价值,需要的朋友可以参考下
    2018-02-02
  • python3里gbk编码的问题解决

    python3里gbk编码的问题解决

    本文主要介绍了python3里gbk编码的问题解决,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的朋友们下面随着小编来一起学习学习吧
    2022-08-08
  • 使用Python抓取模板之家的CSS模板

    使用Python抓取模板之家的CSS模板

    本文给大家介绍的是使用Python抓取模板之家的CSS模板并打包成zip文件的代码,使用的是单线程,非常简单实用,这里分享给大家,有相同需求的小伙伴参考下吧。
    2015-03-03
  • 解决Linux系统中python matplotlib画图的中文显示问题

    解决Linux系统中python matplotlib画图的中文显示问题

    这篇文章主要介绍了解决Linux系统中python matplotlib画图的中文显示问题,需要的朋友可以参考下
    2017-06-06
  • Python 结合opencv实现图片截取和拼接代码实践

    Python 结合opencv实现图片截取和拼接代码实践

    这篇文章主要介绍了Python 结合opencv实现图片截取和拼接代码实践,本文通过实例代码给大家介绍的非常详细,对大家的学习或工作具有一定的参考借鉴价值,需要的朋友可以参考下
    2023-09-09
  • Python通过psd-tools解析PSD文件的实现

    Python通过psd-tools解析PSD文件的实现

    本文主要介绍了Python通过psd-tools解析PSD文件的实现,主要包括如何获取PSD文件的基本信息、遍历图层、提取图层详细信息、保存和创建PSD文件,感兴趣的可以了解一下
    2023-12-12

最新评论