Analyzing 彩阳's CSDN blog



  • Today I came across 彩阳's blog at https://blog.csdn.net/gamesdev. He has written a lot of posts, so I wanted to run some statistics on the content and see what it is actually about.

    First, write a crawler in Python and save the posts into a CSV file.

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    # Scrape posts from 彩阳's CSDN blog
    # https://blog.csdn.net/gamesdev
    # https://blog.csdn.net/gamesdev/article/details/52684465
    import pandas
    import re
    import requests
    from bs4 import BeautifulSoup
    import bs4
    
    def get_one_page(url, data=None, proxy=None, headers=None):
        """Fetch one page and return its HTML text, or None on failure."""
        try:
            s = requests.session()
            s.keep_alive = False
            s.adapters.DEFAULT_RETRIES = 5
            req = s.get(url, timeout=20, headers=headers, data=data, proxies=proxy)
    
            if req.status_code == 200:
                req.encoding = 'utf-8'
                return req.text
            return None
        except Exception:
            return None
    
    
    def write_csv(file_name, data, columns):
        """Append rows of post data to a CSV file."""
        try:
            file = pandas.DataFrame(data)
            file.to_csv(file_name, index=False, header=False, mode='a+', encoding="utf_8_sig", columns=columns)
        except Exception:
            # Log the URL of the post that could not be written
            with open('error.txt', 'a+') as f:
                f.write(data[0]['url'] + '\n')
            print('write error gbk')
    
    
    def main(page):
        print('start...')
        url = 'https://blog.csdn.net/gamesdev/article/list/%s?' % str(page)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
            'Cookie': 'uuid_tt_dd=10_7101573470-1521084240690-516780; __yadk_uid=9ZbhXkuRJ7Gw8EwlBeBn8eLfUdqgP39S; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=1788*1*PC_VC; kd_user_id=7774149a-70bb-44c7-9f2b-b8510dafc938; UN=chen_227; BT=1522213721218; UM_distinctid=1626b021cdb1d9-09bc7c571fb7ff-3a614f0b-1fa400-1626b021cdc6d0; bdshare_firstime=1522731959154; CNZZDATA1259587897=424194549-1523582737-https%253A%252F%252Fwww.baidu.com%252F%7C1523582737; smidV2=201805101510249059baa2d89883e782c45d2932b09b2e00fdf043b804a8a00; dc_session_id=10_1526429486171.338580; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1526348349,1526360837,1526362773,1526429439; TY_SESSION_ID=5ed14904-2d62-4f65-a4b3-39ab854135d5; dc_tos=p8sp8v; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1526430501'
        }
    
        html = get_one_page(url, headers=headers)
    
        if html:
            bs4Obj = BeautifulSoup(html, 'lxml')
            divs = bs4Obj.find('div', {'class': 'article-list'})
    
            num = 0
            urlList = []
            for div in divs:
                # Skip empty nodes and whitespace-only text nodes in the article list
                if div is None:
                    continue
                elif repr(div).strip() == '':
                    continue
                else:
                    try:
                        # Extract the post URL from the anchor tag's href
                        pattern = re.compile(r'href="(.*?)" ', re.S)
                        dataList = re.findall(pattern, repr(div.a))
    
                        # print(num, dataList[0])
                        num += 1
                        urlList.append(dataList[0])
    
                    except:
                        pass
    
            # print(divs)
    
            num = 0
            for url1 in urlList:
                print('handle page=%s, num=%s' % (str(page), str(num)))
                num += 1
                htmlPage = get_one_page(url1, headers=headers)
                if htmlPage:
                    # print(htmlPage)
                    bs4Obj = BeautifulSoup(htmlPage, 'lxml')
                    article = bs4Obj.find('article')
                    allText = ''
    
                    title = (bs4Obj.find('h6',{'class':'title-article'}).string)
    
                    # Walk the children of the article body and collect all of its text
                    for con in article.children:
                        if type(con) == bs4.element.NavigableString:
                            if not con.string.strip() == '':
                                allText += (con.string.strip()) + '\n'
                        else:
                            if not con.text.strip() == '':
                                allText += (con.text.strip()) + '\n'
    
                    print(allText)
    
                    columns = ['url', 'title', 'content']
                    write_csv('data.csv', [{'url': url1, 'title': title, 'content': allText}], columns=columns)
                else:
                    print('htmlPage is none')
    
        else:
            print('html is none')
    
    if __name__ == '__main__':
        for page in range(1, 16):
            main(page)
    

    Because of encoding (mojibake) problems, convert the CSV file into a TXT file.

    # -*- coding: utf-8 -*-
    import codecs
    
    import pandas
    
    
    def read_csv(fileName):
        file = pandas.read_csv(fileName, header=None, encoding='utf_8_sig')
        return file.values[:]
    
    dataList = read_csv('data.csv')
    
    
    # Append the content column of each row to r.txt; rows that cannot be
    # encoded as GBK are reported and skipped
    for data in dataList:
        da = data[2]
        print(da)
        with codecs.open('r.txt', 'a+', encoding='gbk') as f:
            try:
                f.write(da + '\n')
            except Exception:
                print('---', da)
    

    Use jieba to segment the text and count word frequencies, then generate a word-cloud image; the cloud's shape uses a 萌梦 (MengMeng) picture as the mask.

    # -*- coding: utf-8 -*-
    
    # Import the wordcloud and matplotlib modules
    import codecs
    import re
    
    import jieba
    import pandas
    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
    import matplotlib.pyplot as plt
    from scipy.misc import imread
    
    def analyCiYun(hktk, bg=None, imgName='test.jpg'):
        """Generate a word cloud from a {word: frequency} dict and save it."""
        # Read the background (mask) image
        bg_pic = None
        if bg is not None:
            bg_pic = imread(bg)
    
        wordcloud = WordCloud(mask=bg_pic, background_color='white', scale=1.5, font_path='C:\\Windows\\Fonts\\STFANGSO.ttf').generate_from_frequencies(hktk)
    
        if bg_pic is not None:
            image_colors = ImageColorGenerator(bg_pic)
    
        # Show the word-cloud image
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()
    
        # Save the image
        wordcloud.to_file(imgName)
    
    
    def analyCiYun2(hktk, bg=None):
        """Variant that builds the word cloud from raw text instead of a frequency dict."""
        # Read the background (mask) image
        bg_pic = None
        if bg is not None:
            bg_pic = imread(bg)
    
        wordcloud = WordCloud(mask=bg_pic, background_color='white', scale=1.5).generate(hktk)
    
        if bg_pic is not None:
            image_colors = ImageColorGenerator(bg_pic)
    
        # Show the word-cloud image
        plt.imshow(wordcloud)
        plt.axis('off')
        plt.show()
    
        # Save the image
        wordcloud.to_file('test.jpg')
    
    # Words that appear often in the posts but carry little meaning for the cloud
    MANUAL_STOPWORDS = {u'初始化', u'源代码', u'文件夹', u'应用程序', u'编译器', u'比如说',
                        u'构造函数', u'对话框', u'为什么', u'接下来', u'也就是'}
    
    def guolv(data):
        """Clean one segmented token: strip punctuation, drop stop words,
        English letters and digits, and very short tokens."""
        # Strip Chinese and ASCII punctuation from both ends of the token
        data1 = data.strip(u'!,。【】“《》:、”?,.?!\'"{}')
    
        # Manual stop-word filter
        if data1 in MANUAL_STOPWORDS:
            return ''
    
        # Remove English letters, digits, underscores and hyphens
        pattern = re.compile(r'[a-zA-Z0-9_-]+', re.S)
        data1 = re.sub(pattern, '', data1)
    
        # Keep only tokens of at least three characters
        if len(data1) < 3:
            return ''
    
        return data1
    
    
    def cipinCalc(text):
        """Segment the text with jieba and count word frequencies."""
        cutDict = dict()
    
        # jieba.load_userdict('dict.txt')
        seg = jieba.cut(text)
    
        for c1 in seg:
            # Filter punctuation, stop words and short tokens
            c1 = guolv(c1)
            if c1.strip() == '':
                continue
    
            if c1 in cutDict:
                cutDict[c1] += 1
            else:
                cutDict[c1] = 1
    
        return cutDict
    
    def read_csv(fileName):
        file = pandas.read_csv(fileName, header=None, encoding='utf_8_sig')
        return file.values[:]
    
    if __name__ == '__main__':
    
        data = ''
        cutList = []
        cutDict = dict()
    
        with codecs.open('r.txt', 'r', encoding='gbk') as f:
            data = f.read()
    
        cutDict = cipinCalc(data)
    
        analyCiYun(cutDict, bg='bg.png')
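
    One note on the imports: scipy.misc.imread was removed from newer SciPy releases. If the import in the script above fails, a small, hedged fallback (assuming the imageio package is installed) works the same way:

    # Fallback for scipy.misc.imread, which was removed in SciPy >= 1.2.
    # imageio.imread also returns a NumPy array and works as the WordCloud mask.
    try:
        from scipy.misc import imread
    except ImportError:
        from imageio import imread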
    
    

    Result

    [word-cloud image: test.jpg]

    Analysis

    The larger a word is drawn, the more often it appears in the articles.
    You can see that many of the posts talk about coordinate systems, developers, cameras and cross-platform development.
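
    As a quick sanity check on the word cloud, the raw counts can be printed directly. A minimal sketch; the helper name print_top_words and the sample counts below are only illustrative, and in the script above you would pass the cutDict returned by cipinCalc:

    # Print the n most frequent words from a {word: count} dict such as the
    # cutDict built by cipinCalc() above.
    def print_top_words(cutDict, n=20):
        for word, count in sorted(cutDict.items(), key=lambda kv: kv[1], reverse=True)[:n]:
            print(word, count)

    # Illustrative call with a small hand-made dict (not real counts);
    # in the script above: print_top_words(cipinCalc(data))
    print_top_words({u'坐标系': 12, u'开发者': 9, u'摄像机': 7, u'跨平台': 5})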



  • Giving you a thumbs-up! You have really got the hang of this; scraping websites with Python does look like a lot of fun.


 


Recent replies

  • @chinasmu The networking part of WebKit is probably not under Qt's control and is not very extensible, so you cannot yet listen to the data it sends and receives from code. WebEngine, however, can do this. If it is an option, give Qt WebEngine a try. 😺 (A minimal request-interception sketch follows after this list of replies.)

  • I built a dialog program with Qt and added a WebKit control to it. Is there any way to capture all of the network packet data that this control sends and receives?
    I would rather not hook the Windows recv and send functions; alternatively, is there a third-party control that can do this?

  • This error is not a compiler error but an IntelliSense error.
    In that case there is no need to worry too much.

    If you build the application with MSVC, it is best to install the Qt Visual Studio Add-in and develop the Qt application with that plugin.
    One small suggestion, though: writing Qt programs in Qt Creator is very straightforward, without the hassle you have in Visual Studio, and Qt Creator is installed by default; basically, once you have installed Qt 5.12 you can find it in the installation directory. QAxContainer belongs to Qt's ActiveQt module; you have to add QT += axcontainer to the .pro file before the class can be found.

  • VS2017, Qt 5.12.
    I created an empty QWidget dialog project and built and ran it.
    It builds and runs successfully, but the error list shows:

    [screenshot of the errors shown in the error list]

    The exe file is generated successfully.

    Another question: I am building with Qt in VS2017 and need QAxContainer. The help documentation says to add CONFIG += qaxcontainer to the .pro file, but a project created in VS2017 has no .pro file, so where should that line go?

    Also, I want to build a dialog program with an embedded browser control. If I want to add QtWebKit under Qt 5.12, how do I use it?

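To make the WebEngine suggestion above concrete, here is a minimal, hedged sketch in PyQt5 (Python, to match the rest of this post; the same classes exist in C++ Qt). It is only an illustration, not code from the reply: a QWebEngineUrlRequestInterceptor is called for every request a page issues, which covers listening to what the control sends, although it does not expose response bodies.

    # -*- coding: utf-8 -*-
    # Hedged sketch: log every request made by a Qt WebEngine view.
    # The interceptor sees outgoing requests only, not the response data.
    import sys

    from PyQt5.QtCore import QUrl
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebEngineCore import QWebEngineUrlRequestInterceptor
    from PyQt5.QtWebEngineWidgets import QWebEngineProfile, QWebEngineView


    class RequestLogger(QWebEngineUrlRequestInterceptor):
        def interceptRequest(self, info):
            # Called for every request the page issues
            print(bytes(info.requestMethod()).decode(), info.requestUrl().toString())


    app = QApplication(sys.argv)

    logger = RequestLogger()  # keep a reference so it is not garbage collected
    # setUrlRequestInterceptor() replaces this call in Qt >= 5.13
    QWebEngineProfile.defaultProfile().setRequestInterceptor(logger)

    view = QWebEngineView()
    view.load(QUrl('https://blog.csdn.net/gamesdev'))
    view.show()
    sys.exit(app.exec_())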











