分析彩阳的csdn博客



  • 今天看到了彩阳的博客, https://blog.csdn.net/gamesdev, 写了很多帖子,想统计一下内容,看看到底在说什么

    用python写爬虫,把帖子存到csv文件中

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    #csdn 彩阳 帖子
    # https://blog.csdn.net/gamesdev
    #https://blog.csdn.net/gamesdev/article/details/52684465
    import pandas
    import re
    import requests
    import threading, json, urllib
    from bs4 import BeautifulSoup
    import bs4
    
    def get_one_page(url, data=None, proxy=None, headers=None):
        """Fetch ``url`` with a GET request and return the body decoded as UTF-8.

        Args:
            url: Address to request.
            data: Optional request body, forwarded to ``requests`` unchanged.
            proxy: Optional proxies mapping, forwarded as ``proxies``.
            headers: Optional request headers.

        Returns:
            The response text when the server answers with status 200;
            ``None`` for any other status or when the request fails.
        """
        try:
            # The context manager closes the session (and its pooled
            # connections) even when the request raises.
            with requests.Session() as s:
                # Retries have to be configured on the transport adapter;
                # assigning ad-hoc attributes on the session has no effect.
                adapter = requests.adapters.HTTPAdapter(max_retries=5)
                s.mount('http://', adapter)
                s.mount('https://', adapter)

                req = s.get(url, timeout=20, headers=headers, data=data, proxies=proxy)
                if req.status_code != 200:
                    return None

                req.encoding = 'utf-8'
                return req.text
        except requests.RequestException:
            # Network-level failure (DNS, timeout, bad URL, ...): callers treat
            # None as "page not available".
            return None
    
    
    def write_csv(file_name, data, columns):
        """Append ``data`` to the CSV file ``file_name`` without a header row.

        Args:
            file_name: Path of the CSV file; rows are appended to it.
            data: List of dict records, one per row.
            columns: Column names to write, in output order.

        On failure the ``url`` of the first record is appended to
        ``error.txt`` and a message is printed; the exception is not
        propagated.
        """
        try:
            frame = pandas.DataFrame(data)
            frame.to_csv(file_name, index=False, header=False, mode='a+', encoding="utf_8_sig", columns=columns)
        except Exception:
            # Best-effort logging boundary. Records are dicts, so the URL must
            # be read by key; attribute access would raise inside the handler
            # and hide the original failure.
            first = data[0] if isinstance(data, (list, tuple)) and data else {}
            failed_url = first.get('url', '') if isinstance(first, dict) else ''
            with open('error.txt', 'a+', encoding='utf-8') as f:
                f.write(str(failed_url) + '\n')
            print('write error gbk')
    
    # Module-level set kept from the original script; none of the code shown
    # here reads or updates it.
    allSet = set()


    def _extract_article_urls(list_html):
        """Return the article links found in the HTML of one list page."""
        soup = BeautifulSoup(list_html, 'lxml')
        container = soup.find('div', {'class': 'article-list'})
        if container is None:
            # Layout changed or an error page was served: nothing to crawl.
            return []

        urls = []
        for child in container.children:
            # Whitespace between entries shows up as text nodes; only tags can
            # contain a link.
            if not isinstance(child, bs4.element.Tag):
                continue
            link = child.a
            href = link.get('href') if link is not None else None
            if href:
                urls.append(href)
        return urls


    def _extract_article_text(article):
        """Join the non-empty text of each direct child of ``article``, one per line."""
        lines = []
        for node in article.children:
            if type(node) == bs4.element.NavigableString:
                text = node.string.strip()
            else:
                text = node.text.strip()
            if text:
                lines.append(text + '\n')
        return ''.join(lines)


    def main(page):
        """Crawl one list page of the blog and append every article to ``data.csv``.

        Args:
            page: Number of the list page, substituted into the list URL.

        Each article is written as a row with the columns ``url``, ``title``
        and ``content``. Pages or articles that cannot be fetched or parsed
        are reported on stdout and skipped.
        """
        print('start...')
        list_url = 'https://blog.csdn.net/gamesdev/article/list/%s?' % str(page)
        # NOTE(review): the Cookie below is a captured browser session that is
        # hard-coded in the script; consider loading it from configuration.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
            'Cookie': 'uuid_tt_dd=10_7101573470-1521084240690-516780; __yadk_uid=9ZbhXkuRJ7Gw8EwlBeBn8eLfUdqgP39S; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=1788*1*PC_VC; kd_user_id=7774149a-70bb-44c7-9f2b-b8510dafc938; UN=chen_227; BT=1522213721218; UM_distinctid=1626b021cdb1d9-09bc7c571fb7ff-3a614f0b-1fa400-1626b021cdc6d0; bdshare_firstime=1522731959154; CNZZDATA1259587897=424194549-1523582737-https%253A%252F%252Fwww.baidu.com%252F%7C1523582737; smidV2=201805101510249059baa2d89883e782c45d2932b09b2e00fdf043b804a8a00; dc_session_id=10_1526429486171.338580; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1526348349,1526360837,1526362773,1526429439; TY_SESSION_ID=5ed14904-2d62-4f65-a4b3-39ab854135d5; dc_tos=p8sp8v; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1526430501'
        }

        html = get_one_page(list_url, headers=headers)
        if not html:
            print('html is none')
            return

        columns = ['url', 'title', 'content']
        for num, article_url in enumerate(_extract_article_urls(html)):
            print('handle page=%s, num=%s' % (str(page), str(num)))
            htmlPage = get_one_page(article_url, headers=headers)
            if not htmlPage:
                print('htmlPage is none')
                continue

            soup = BeautifulSoup(htmlPage, 'lxml')
            article = soup.find('article')
            if article is None:
                # No <article> element: skip this post instead of crashing
                # the whole crawl.
                print('article is none')
                continue

            title_tag = soup.find('h6', {'class': 'title-article'})
            title = title_tag.string if title_tag is not None else ''

            allText = _extract_article_text(article)
            print(allText)

            write_csv('data.csv', [{'url': article_url, 'title': title, 'content': allText}], columns=columns)
    
    if __name__ == '__main__':
        # Crawl list pages 1 through 15, one after another.
        first_page, last_page = 1, 15
        for page in range(first_page, last_page + 1):
            main(page)
    

    因为乱码问题,将csv文件转成txt

    # -*- coding: utf-8 -*-
    import codecs
    import re
    
    import chardet
    import pandas
    
    
    def read_csv(fileName):
        """Load the header-less CSV ``fileName`` and return its rows as an array."""
        table = pandas.read_csv(
            fileName,
            header=None,
            encoding='utf_8_sig',
        )
        rows = table.values
        return rows[:]
    
    # Rows of data.csv; the third column (index 2) holds the text that is
    # copied into r.txt.
    dataList = read_csv('data.csv')

    # Open the output once instead of reopening it for every row.
    with codecs.open('r.txt', 'a+', encoding='gbk') as f:
        for data in dataList:
            da = data[2]
            print(da)
            # print(chardet.detect(da))
            if not isinstance(da, str):
                # Non-text cell (presumably an empty field read back as NaN):
                # report it and move on.
                print('---', da)
                continue
            try:
                f.write(da + '\n')
            except UnicodeEncodeError:
                # The text contains characters GBK cannot represent; the row
                # is reported and skipped.
                print('---', da)
    

    使用结巴分词,统计词频,使用词云生成图片,图片形状采用萌梦图片

    # -*- coding: utf-8 -*-
    
    # 导入wordcloud模块和matplotlib模块
    import codecs
    import re
    
    import chardet
    import jieba
    import pandas
    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
    import matplotlib.pyplot as plt
    from scipy.misc import imread
    from snownlp import SnowNLP
    
    def analyCiYun(hktk, bg=None, imgNmae='test.jpg', font_path='C:\\Windows\\Fonts\\STFANGSO.ttf'):
        """Build a word cloud from word frequencies, display it and save it.

        Args:
            hktk: Word -> frequency mapping passed to
                ``WordCloud.generate_from_frequencies``.
            bg: Optional path of an image that is loaded and passed to
                ``WordCloud`` as ``mask``.
            imgNmae: Path the rendered image is saved to.
            font_path: Font file handed to ``WordCloud``. The default is the
                Windows font path that used to be hard-coded; pass another
                path on other systems.
        """
        # Load the mask image, if one was requested.
        bg_pic = imread(bg) if bg is not None else None

        # The previous version also built an ImageColorGenerator from the mask
        # but never used it; it was dropped because it could only add a
        # failure for images it does not accept.
        wordcloud = WordCloud(
            mask=bg_pic,
            background_color='white',
            scale=1.5,
            font_path=font_path,
        ).generate_from_frequencies(hktk)

        # Show the word cloud.
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()

        # Save the image.
        wordcloud.to_file(imgNmae)
    
    
    def analyCiYun2(hktk, bg=None, imgNmae='test.jpg', font_path=None):
        """Build a word cloud from raw text, display it and save it.

        Args:
            hktk: Text passed to ``WordCloud.generate``.
            bg: Optional path of an image that is loaded and passed to
                ``WordCloud`` as ``mask``.
            imgNmae: Path the rendered image is saved to; defaults to the
                previously hard-coded ``test.jpg``.
            font_path: Optional font file handed to ``WordCloud``. ``None``
                keeps the library default, as before.
        """
        # Load the mask image, if one was requested.
        bg_pic = imread(bg) if bg is not None else None

        # The previous version also built an ImageColorGenerator from the mask
        # but never used it; it was dropped because it could only add a
        # failure for images it does not accept.
        wordcloud = WordCloud(
            mask=bg_pic,
            background_color='white',
            scale=1.5,
            font_path=font_path,
        ).generate(hktk)

        # Show the word cloud.
        plt.imshow(wordcloud)
        plt.axis('off')
        plt.show()

        # Save the image.
        wordcloud.to_file(imgNmae)
    
    englishChars = ['a','b']

    # Punctuation removed from both ends of a token: ASCII marks, CJK marks
    # (ideographic full stop, lenticular brackets, curly quotes, angle quotes,
    # ideographic comma) and the full-width forms of ! , : ?
    _GUOLV_PUNCTUATION = (
        u'!,:?.\'"{}'
        u'\u3002\u3010\u3011\u201c\u201d\u300a\u300b\u3001'
        u'\uff01\uff0c\uff1a\uff1f'
    )

    # Manually filtered words: a token equal to one of these is dropped.
    _GUOLV_STOP_WORDS = frozenset([
        u'初始化',
        u'源代码',
        u'文件夹',
        u'应用程序',
        u'编译器',
        u'比如说',
        u'构造函数',
        u'对话框',
        u'为什么',
        u'接下来',
        u'也就是',
    ])

    # Runs of ASCII letters / digits (with '_' and '-') are removed from tokens.
    _GUOLV_ASCII_RUNS = re.compile(r'([a-zA-Z_-]+)|([0-9_-]+)', re.S)


    def guolv(data):
        """Clean one segmented token and decide whether it is worth counting.

        Args:
            data: A single token (string).

        Returns:
            The token with surrounding punctuation and all ASCII
            letter/digit runs removed, or ``''`` when the result is a
            manually filtered word or shorter than three characters.
        """
        data1 = data.strip(_GUOLV_PUNCTUATION)
        data1 = _GUOLV_ASCII_RUNS.sub('', data1)

        # Whole-word comparison: str.strip() with a multi-character argument
        # removes any of those characters from the ends, which mangled
        # unrelated words that merely share a character with a filtered one.
        if data1 in _GUOLV_STOP_WORDS:
            return ''

        if len(data1) < 3:
            return ''

        return data1
    
    
    def cipinCalc(text):
        """Count how often each cleaned token occurs in ``text``.

        The text is segmented with ``jieba.cut``; every token goes through
        ``guolv`` and tokens that come back blank are ignored.

        Returns:
            A dict mapping each remaining token to its number of occurrences.
        """
        # jieba.load_userdict('dict.txt')
        tokens = list(jieba.cut(text))

        counts = dict()
        for raw in tokens:
            # Filter punctuation and unwanted tokens.
            word = guolv(raw)
            if word.strip() != '':
                counts[word] = counts.get(word, 0) + 1

        return counts
    
    def read_csv(fileName):
        """Read ``fileName`` as a header-less CSV and return the cell values."""
        options = {'header': None, 'encoding': 'utf_8_sig'}
        frame = pandas.read_csv(fileName, **options)
        return frame.values[:]
    
    if __name__ == '__main__':
        # Start from empty values, then fill them in below.
        data, cutList, cutDict = '', [], dict()

        # Read the whole GBK-encoded text file in one go.
        with codecs.open('r.txt', 'r', encoding='gbk') as f:
            data = f.read()

        # Word frequencies first, then the word cloud masked by bg.png.
        cutDict = cipinCalc(data)
        analyCiYun(cutDict, bg='bg.png')
    
    

    效果

    0_1526449874831_test.jpg

    分析

    文字越大,说明文章中出现词频越高
    可以看到很多文章中涉及坐标系,开发者,摄像机,跨平台的描述



  • 为你点一个赞!这都会融会贯通了,看来做Python的爬网站真的很有意思。


Log in to reply
 

走马观花

最近的回复

  • C

    Qt for MCU需要商业授权的

    read more
  • Qt for MCUs

    搭建Qt for MCUs PC端开发环境。qt for mcus提供了一个完整的图形框架和工具包,包含了在MCUs上设计、开发和部署gui所需的一切。它允许您在裸机或实时操作系统上运行应用程序。

    先决条件

    开发主机环境支持仅限于Windows 10

    MSVC compiler v19.16 (Visual Studio 2017 15.9.9 or newer) x64

    CMake v3.13 or newer (you can install it using the Qt Online installer) x64

    使用Qt联机安装程序安装Qt for MCUs,该安装程序可通过Qt帐户下载

    安装Qt 5.14和Qt Creator 4.11 or higher

    安装链接

    › Qt: https://account.qt.io/downloads
    › CMake: https://cmake.org/download/
    › Python 2.7 32-bit: https://www.python.org/downloads/release/python-2716/
    › Arm GCC: https://developer.arm.com/tools-and-software/open-source-software/developer-tools/gnutoolchain/gnu-rm/downloads
    › J-Link Software Pack: https://www.segger.com/downloads/jlink/JLink_Windows.exe
    › J-Link OpenSDA Firmware: https://www.segger.com/downloads/jlink/OpenSDA_MIMXRT1050-EVKHyperflash
    › STM32CubeProgrammer: https://www.st.com/en/development-tools/stm32cubeprog.html
    › STM32 ST-LINK Utility: https://www.st.com/en/development-tools/stsw-link004.html

    Qt Creator设置 启用Qt Creator插件 选择“帮助>关于插件”,然后从列表中选择“MCU支持(实验性)”插件,重新启动Qt Creator以应用更改
    替代文字 为MCU创建Qt工具包

    选择工具>选项>设备>MCU

    选择Qt for MCUs-Desktop 32bpp作为目标

    如果尚未设置,请提供Qt for MCUs安装目录的路径。

    单击Apply应用。

    替代文字

    替代文字
    替代文字

    注意:

    编译器要选X64,Qt版本要选64bit,CMake Tool选x64

    打开恒温器项目demo

    选择文件>打开文件或项目。。。

    打开thermo文件夹中的CMakeLists.txt文件。

    选择Qt for MCUs-Desktop 32bpp套件。

    单击“配置项目”以完成。

    替代文字

    问题

    开发主机环境支持仅限于Windows 10

    文本使用较大的font.pixelSize时,C++编译失败。

    文本类型无法正确呈现需要复杂文本布局的unicode序列。对复杂文本使用StaticText

    read more
  • H

    hi 有问题请教你,方便加个联系方式吗

    read more
  • boost.asio是一个很棒的网络库,这回儿我也开始系统地学习起来了。想想当年接触boost,也有八年多了。这次开始接触boost,觉得既熟悉又陌生。熟悉的是小写字母+下划线的命名方式、晦涩的模板、很慢的编译速度以及较大的程序体积,陌生的是asio的各种概念:io服务、接收器、套接字等等:我之前对网络编程不是非常了解。

    于是根据我的理解,参考《Boost.Asio C++网络编程》实现了这样一个简单的客户端和服务端通信的例子,例子非常简单,还不完善,但是幸运的是,可以在本机上互通了。
    下面是客户端的代码:

    #include <iostream> #include <boost/asio.hpp> #include <boost/proto/detail/ignore_unused.hpp> using namespace std; using namespace boost::asio; using namespace boost::system; using namespace boost::proto::detail;// 提供ignore_unused方法 void writeHandler( const boost::system::error_code& ec, size_t bytesTransferred ) { if ( ec ) { cout << "Write data error, code: " << ec.value( ) << "transferred: " << bytesTransferred << endl; } else { cout << "OK! " << bytesTransferred << "bytes written. " << endl; } } int main(int argc, char *argv[]) { ignore_unused( argc ); ignore_unused( argv ); io_service service; ip::tcp::socket sock( service ); ip::tcp::endpoint ep( ip::address::from_string( "127.0.0.1" ), 6545 ); boost::system::error_code ec; sock.connect( ep, ec ); if ( ec ) { cout << "Connect error, code: " << ec.value( ) << ", We will exit." << endl; return ec.value( ); } else { char buf[1024] = "Hello world!"; sock.async_write_some( buffer( buf ), writeHandler ); sock.close( ); } return service.run( ); }

    下面是服务端的代码:

    #include <iostream> #include <boost/asio.hpp> #include <boost/proto/detail/ignore_unused.hpp> using namespace std; using namespace boost::asio; using namespace boost::system; using namespace boost::proto::detail;// 提供ignore_unused方法 void acceptHandle( const boost::system::error_code& code ) { cout << "Accepted." << endl; } int main(int argc, char *argv[]) { ignore_unused( argc ); ignore_unused( argv ); io_service service; ip::tcp::endpoint ep( ip::address::from_string( "127.0.0.1" ), 6545 ); boost::system::error_code ec; ip::tcp::socket sock( service ); ip::tcp::acceptor acceptor( service, ep ); acceptor.async_accept( sock, acceptHandle ); if ( ec ) { cout << "There is an error in server. code: " << ec.value( ) << endl; } return service.run( );// 阻塞运行 }

    运行结果是这样的:
    78448d7b-b3ae-42fc-9e2e-4dd2fbdac2c2-image.png

    我对boost.asio中几个概念的理解:

    io_service,这就是一个类似事件循环的东西,它为io设备提供服务,故名。不管是套接字、文件还是串口设备,都要使用它的服务。它的run()函数相当于启动了一个事件循环。一旦有消息了,即进行响应。这也是实现异步编程的重要基础。 socket,这个类则是套接字,可以处理TCP或者是UDP请求。有同步以及异步的处理方式,也有带异常以及不带异常的处理方式。 acceptor,接收器,仅仅是服务端使用。相当于其余框架中的listener,作接收用的。

    比较浅显,如果有不当之处,敬请指正。

    read more

关注我们

微博
QQ群