【工具】js脚本下载百度文库生成word文本 + python爬取百度文库

js脚本

在浏览器开发者工具(F12)的控制台(Console)中粘贴以下脚本并按回车键即可

// Extract the visible text of a Baidu Wenku document and download it as a
// .doc file. Run by pasting into the browser devtools console.

// 1. Collect the text. Each rendered fragment lives in a ".reader-word-layer"
// span; a change in the vertical offset (style.top) marks the start of a new
// visual line, so we insert a newline there.
let topDiff = -1;
let content = "";
const filename = document.getElementsByClassName('doc-title')[0].innerText;

const target = document.querySelectorAll(".reader-word-layer");
target.forEach(x => {
    if (x.style.top !== topDiff) {
        content += "\n";
        topDiff = x.style.top;
    }
    content += x.innerText;
});

// 2. Build a temporary anchor with a data: URI and click it to trigger the
// browser's download flow.
const element = document.createElement('a');
element.setAttribute('href', 'data:text/plain;charset=utf-8,' + encodeURIComponent(content));
element.setAttribute('download', filename + ".doc");

// BUG FIX: the original assigned ".reader-word-layer" here, which is not a
// valid CSS display value; 'none' hides the helper link as intended.
element.style.display = 'none';
document.body.appendChild(element);

element.click();

document.body.removeChild(element);

这段脚本是之前记录下来的,原文出自其他文章,具体出处已不可考

python爬取

import os
import re
import json
import requests
from urllib.request import urlretrieve


class BaiduWk:
    """Download Baidu Wenku (百度文库) documents.

    Two interactive workflows are provided:
      * run_word() -- fetch the document text page by page and save it
        as a ``.doc`` file under ``百度文库/``.
      * run_img()  -- fetch every page of a PPT-style document as a JPG.
    """

    def __init__(self):
        # One text chunk per parsed page; joined together on save.
        self.list_info = []
        # A single session so cookies persist between requests.
        self.session = requests.session()
        # Mobile user agent -- presumably the mobile site exposes the
        # rtcs endpoints words_data() relies on (TODO confirm).
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 \
            (KHTML, like Gecko) Chrome/80.0.3987.87 Mobile Safari/537.36'}

    def get_html(self, start_url):
        """Fetch *start_url* and return its decoded HTML text."""
        response = self.session.get(start_url, headers=self.headers)
        # The pages do not always declare their charset; trust detection.
        response.encoding = response.apparent_encoding
        return response.text

    def parse_html(self, data):
        """Extract download parameters embedded in the page HTML.

        Returns ``(params, page_range, title)`` where *params* is the
        query dict for the wkrtcs content API, *page_range* the list of
        per-page ``range`` tokens and *title* the document title.
        Raises IndexError when the page lacks the expected fields
        (e.g. the URL was not a Wenku document page).
        """
        re_title = re.findall("'title': '(.*?)',", data)
        # Fall back to the <title> tag when the inline JS title is absent.
        title = re_title[0] if re_title else re.findall('<title>(.*?)</title>', data)[0]
        params = {
            'bucketNum': re.findall(r'"bucketNum":(\d+)', data)[0],
            'md5sum': re.findall('md5sum=(.*?)&', data)[0],
            'sign': re.findall('sign=(.*?)&', data)[0],
            'rtcs_flag': re.findall('rtcs_flag=(.*?)&', data)[0],
            'rtcs_ver': re.findall('rtcs_ver=(.*?)&', data)[0],
            'rsign': re.findall('"rsign":"(.*?)"', data)[0], }
        # One {"page":N,"range":"..."} entry per document page.
        page_range = re.findall(r'{"page":\d+,"range":"(.*?)"}', data)
        return params, page_range, title

    def words_data(self, params, page_range):
        """Yield the raw content-API response for each page in turn."""
        url = r'https://wkrtcs.bdimg.com/rtcs/webapp'
        for i in range(1, len(page_range) + 1):
            print(f'正在解析第{i}页数据,飞速读取中...')
            # The API wants the page number plus that page's range token.
            params['pn'] = i
            params['range'] = page_range[i - 1]
            # Send the same headers as every other request in this class.
            response = self.session.get(url, params=params, headers=self.headers).text
            yield response

    def get_words(self, response):
        """Decode each page payload in *response* and append its text to
        ``self.list_info``, one entry per page."""
        pages = 1
        for data in response:
            # The payload is JSONP-like: strip the 5-character callback
            # prefix and the trailing ')' to get plain JSON.
            text = ''
            d = json.loads(data[5:-1])
            # Walk document.xml -> blocks ('c') -> runs ('c') and collect
            # each run's text payload.
            for block in d['document.xml']:
                for paragraph in block['c']:
                    text += '\n'
                    for run in paragraph['c']:
                        try:
                            text += run['c'] + '\n'
                        except (KeyError, TypeError):
                            # Non-text runs (no 'c' key, or not a dict)
                            # carry no copyable text; skip them.
                            continue
            text += f'\n------------------------当前第{pages}页-------------------------\n'
            pages += 1
            self.list_info.append(text)

    def save_info(self, title, path):
        """Write the collected text to *path*, creating its directory.

        *title* is unused but kept for backward compatibility with
        existing callers.
        """
        # Derive the directory from path instead of hard-coding '百度文库'.
        folder = os.path.dirname(path)
        if folder:
            os.makedirs(folder, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            f.writelines(self.list_info)

    def get_img(self, start_url):
        """Parse an image/PPT document page.

        Returns ``(totalPageNum, title, docId)`` where *totalPageNum* is
        the page count plus one, so ``range(1, totalPageNum)`` in
        download_img() covers every page.
        """
        print('开始尝试解析百度文库图片...\n')
        r = self.session.get(start_url, headers=self.headers)
        r.encoding = r.apparent_encoding
        title = re.findall("'title': '(.*?)'", r.text)[0]
        print(title)
        docId = re.findall("'docId': '(.*?)'", r.text)[0]
        totalPageNum = int(re.findall("'totalPageNum': '(.*?)'", r.text)[0]) + 1
        return totalPageNum, title, docId

    def download_img(self, totalPageNum, title, docId):
        """Download page images 1..totalPageNum-1 into a folder named *title*."""
        # Create the output folder once, not on every loop iteration.
        os.makedirs(title, exist_ok=True)
        api_url = 'https://wenku.baidu.com/browse/getrequest'
        for pn in range(1, totalPageNum):
            params = {'doc_id': docId, 'pn': pn, 'rn': 1, 'type': 'ppt', }
            r = self.session.get(api_url, params=params, headers=self.headers)
            src = r.json()[0].get('zoom')
            path = title + '/' + str(pn) + '.jpg'
            urlretrieve(src, path)
            print(f'正在提取第{pn}页,请稍等...')

    def set_word(self, path):
        """Remove duplicate lines from the file at *path* in place,
        keeping the first occurrence of each line in original order."""
        with open(path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        # dict.fromkeys deduplicates while preserving first-seen order in
        # O(n), unlike the previous list(set(...)) + index-sort (O(n^2)).
        with open(path, 'w', encoding='utf-8') as f:
            f.writelines(dict.fromkeys(lines))
            print('done')

    def run_word(self):
        """Interactive entry point: download a document's text content."""
        print('开始尝试解析百度文库页面...\n')
        start_url = input('输入百度文库中的连接:')
        print('running...\n')
        # Switch to the mobile domain (wk.baidu.com), which serves the
        # inline parameters parse_html() needs.
        start_url = re.sub('wenku', 'wk', start_url)
        html = self.get_html(start_url)
        param, ranges, title = self.parse_html(html)
        print(f'当前文章:{title}\n')
        path = '百度文库/' + title + '.doc'
        response = self.words_data(param, ranges)
        self.get_words(response)
        self.save_info(title, path)
        self.set_word(path)
        print('done!!!')
        print('程序执行完毕!')

    def run_img(self):
        """Interactive entry point: download a document's page images."""
        print('开始尝试解析百度文库图片信息...\n')
        start_url = input('输入百度文库中的连接:')
        print('running...\n')
        totalPageNum, title, docId = self.get_img(start_url)
        self.download_img(totalPageNum, title, docId)
        print('done!!!')
        print('程序执行完毕!')


if __name__ == '__main__':
    # Text download by default; switch the commented call to grab the
    # page images of a PPT-style document instead.
    downloader = BaiduWk()
    downloader.run_word()
    # downloader.run_img()

在pycharm里面运行即可,也是很久之前转的代码,由于自己只记录了代码,原文链接没保存…不过之前经常用

更多相关推荐

Python爬取百度文库

Python爬取百度文库爬取网址:https://wenku.baidu.com/view/aa31a84bcf84b9d528ea7a2c.html1....

继续阅读

python可以下载百度文库_python+...

有些时候我们需要用到百度文库的某些文章时,却发现需要会员才能下载,很难受,其实我们可以通...

继续阅读

python爬虫百度文库源码_Python...

首先先分享下github这篇文章吧,参考了部分代码,但我想做一个很详细的解读。新版百度文库爬虫...

继续阅读

Python爬取百度文库并存储为word...

在做爬取数据之前,你需要下载安装两个东西,一个是urllib,另外一个是python-docx。doc是微软...

继续阅读

超简单,一行代码获取百度文库内...

文库类资料应该是不少白嫖党的最爱,本渣渣白嫖怪也不例外,尤其是百度文库,内容多,资料全,...

继续阅读

Python爬虫——爬取百度文库文章

爬取-百度文库中的文章爬取大多数百度文库的文章或图片数据创建了两个调用方法,分别获取文字...

继续阅读

python爬取百度图片之js逆向

开Network页,看到一堆js加密的参数:按,全局搜索字段:点进,按一下大括号搜索字段,在2160行:...

继续阅读

python百度文库文字提取_百度文...

临近毕业,学校要求写实习报告,自己写报告是不可能写的,肯定是抄啊,百度文库能给你白抄么,...

继续阅读

python爬取百度文库DOC文档的简...

谈谈需求百度文库在我们需要查找复制一些文档的时候经常用到,但是,现在的百度文库没以前那么...

继续阅读