Analyzing 彩阳's CSDN blog



  • Today I came across 彩阳's blog, https://blog.csdn.net/gamesdev. He has written a lot of posts, so I wanted to run some statistics on the content and see what they are actually about.

    I wrote a crawler in Python and saved the posts to a CSV file.

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    # Crawl 彩阳's CSDN posts
    # https://blog.csdn.net/gamesdev
    # e.g. https://blog.csdn.net/gamesdev/article/details/52684465
    import re

    import pandas
    import requests
    from bs4 import BeautifulSoup
    import bs4
    
    def get_one_page(url, data=None, proxy=None, headers=None):
        # Fetch one page and return its text, or None on failure.
        try:
            s = requests.session()
            # retry transient network failures a few times
            s.mount('https://', requests.adapters.HTTPAdapter(max_retries=5))
            req = s.get(url, timeout=20, headers=headers, data=data, proxies=proxy)

            if req.status_code == 200:
                req.encoding = 'utf-8'
                return req.text
            return None
        except requests.RequestException:
            return None
    
    
    def write_csv(file_name, data, columns):
        # Append one post (a list with a single dict) to the CSV file; log the URL on failure.
        try:
            frame = pandas.DataFrame(data)
            frame.to_csv(file_name, index=False, header=False, mode='a+', encoding="utf_8_sig", columns=columns)
        except Exception:
            # data is a list of dicts, so the URL must be read by key
            with open('error.txt', 'a+') as f:
                f.write(data[0]['url'] + '\n')
            print('write_csv failed, url saved to error.txt')
    
    allSet = set()
    
    def main(page):
        print('start...')
        url = 'https://blog.csdn.net/gamesdev/article/list/%s?' % str(page)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
            'Cookie': 'uuid_tt_dd=10_7101573470-1521084240690-516780; __yadk_uid=9ZbhXkuRJ7Gw8EwlBeBn8eLfUdqgP39S; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=1788*1*PC_VC; kd_user_id=7774149a-70bb-44c7-9f2b-b8510dafc938; UN=chen_227; BT=1522213721218; UM_distinctid=1626b021cdb1d9-09bc7c571fb7ff-3a614f0b-1fa400-1626b021cdc6d0; bdshare_firstime=1522731959154; CNZZDATA1259587897=424194549-1523582737-https%253A%252F%252Fwww.baidu.com%252F%7C1523582737; smidV2=201805101510249059baa2d89883e782c45d2932b09b2e00fdf043b804a8a00; dc_session_id=10_1526429486171.338580; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1526348349,1526360837,1526362773,1526429439; TY_SESSION_ID=5ed14904-2d62-4f65-a4b3-39ab854135d5; dc_tos=p8sp8v; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1526430501'
        }
    
        html = get_one_page(url, headers=headers)
    
        if html:
            bs4Obj = BeautifulSoup(html, 'lxml')
            divs = bs4Obj.find('div', {'class': 'article-list'})
    
            num = 0
            urlList = []
            for div in divs:
                # skip None entries and whitespace-only text nodes between articles
                if div is None or str(div).strip() == '':
                    continue
                try:
                    # pull the post URL out of the entry's first <a> tag
                    pattern = re.compile(r'href="(.*?)" ', re.S)
                    dataList = re.findall(pattern, repr(div.a))
                    num += 1
                    urlList.append(dataList[0])
                except Exception:
                    pass
    
            # print(divs)
    
            num = 0
            for url1 in urlList:
                print('handle page=%s, num=%s' % (str(page), str(num)))
                num += 1
                htmlPage = get_one_page(url1, headers=headers)
                if htmlPage:
                    # print(htmlPage)
                    bs4Obj = BeautifulSoup(htmlPage, 'lxml')
                    article = bs4Obj.find('article')
                    allText = ''
    
                    title = (bs4Obj.find('h6',{'class':'title-article'}).string)
    
                    for con in article.children:
                        # plain text nodes only have .string; tags expose .text
                        if isinstance(con, bs4.element.NavigableString):
                            if con.string.strip() != '':
                                allText += con.string.strip() + '\n'
                        else:
                            if con.text.strip() != '':
                                allText += con.text.strip() + '\n'
    
                    print(allText)
    
                    columns = ['url', 'title', 'content']
                    write_csv('data.csv', [{'url': url1, 'title': title, 'content': allText}], columns=columns)
                else:
                    print('htmlPage is none')
    
        else:
            print('html is none')
    
    if __name__ == '__main__':
        for page in range(1, 16):
            main(page)
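
    Side note: pulling the href out of repr(div.a) with a regex works, but it is fragile. BeautifulSoup can read the attribute directly; below is a minimal sketch of the same link collection (the article-list class and the /article/details/ URL pattern are taken from the code and links above, not verified against the current page layout):

    from bs4 import BeautifulSoup

    def collect_article_links(html):
        # Alternative to the regex over repr(div.a): read the href attribute directly.
        soup = BeautifulSoup(html, 'lxml')
        container = soup.find('div', {'class': 'article-list'})
        if container is None:
            return []
        links = []
        for a in container.find_all('a', href=True):
            href = a['href']
            # keep only links that point at an article detail page, without duplicates
            if '/article/details/' in href and href not in links:
                links.append(href)
        return links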
    

    Because of mojibake (garbled character) issues, convert the CSV file to a plain txt file.

    # -*- coding: utf-8 -*-
    import codecs

    import pandas
    
    
    def read_csv(fileName):
        file = pandas.read_csv(fileName, header=None, encoding='utf_8_sig')
        return file.values[:]
    
    dataList = read_csv('data.csv')
    
    
    for data in dataList:
        da = data[2]  # column 2 holds the article text
        print(da)
        with codecs.open('r.txt', 'a+', encoding='gbk') as f:
            try:
                f.write(da + '\n')
            except UnicodeEncodeError:
                # some characters cannot be represented in GBK; skip this row
                print('---', da)
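
    As written, the try/except silently drops any row whose text contains a character that GBK cannot encode. If those rows should be kept, the errors argument of open() can substitute the offending characters instead; a minimal sketch of the same conversion:

    # -*- coding: utf-8 -*-
    import pandas

    def csv_to_txt(csv_name='data.csv', txt_name='r.txt'):
        # Append the content column (index 2) of every crawled row to the txt file.
        frame = pandas.read_csv(csv_name, header=None, encoding='utf_8_sig')
        # errors='replace' writes '?' for characters GBK cannot represent,
        # instead of raising UnicodeEncodeError and losing the whole row.
        with open(txt_name, 'a+', encoding='gbk', errors='replace') as f:
            for row in frame.values:
                f.write(str(row[2]) + '\n')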
    

    Use jieba for word segmentation, count word frequencies, and render the result with wordcloud; the 萌梦 image is used as the mask shape.

    # -*- coding: utf-8 -*-
    
    # import the wordcloud and matplotlib modules, plus jieba for segmentation
    import codecs
    import re

    import jieba
    import matplotlib.pyplot as plt
    import numpy
    import pandas
    from PIL import Image
    from wordcloud import WordCloud, ImageColorGenerator
    
    def analyCiYun(hktk, bg=None, img_name='test.jpg'):
        # Render a word cloud from a {word: frequency} dict, optionally shaped by a mask image.
        # read the mask image (the 萌梦 picture) if one was given
        bg_pic = None
        if bg is not None:
            bg_pic = numpy.array(Image.open(bg))

        wordcloud = WordCloud(mask=bg_pic, background_color='white', scale=1.5,
                              font_path='C:\\Windows\\Fonts\\STFANGSO.ttf').generate_from_frequencies(hktk)

        if bg_pic is not None:
            image_colors = ImageColorGenerator(bg_pic)  # computed but not applied here

        # display the word cloud
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()

        # save the image
        wordcloud.to_file(img_name)
    
    
    def analyCiYun2(hktk, bg=None):
        # Variant that builds the word cloud from raw text via generate() instead of a frequency dict.
        bg_pic = None
        if bg is not None:
            bg_pic = numpy.array(Image.open(bg))

        wordcloud = WordCloud(mask=bg_pic, background_color='white', scale=1.5).generate(hktk)

        if bg_pic is not None:
            image_colors = ImageColorGenerator(bg_pic)  # computed but not applied here

        # display the word cloud
        plt.imshow(wordcloud)
        plt.axis('off')
        plt.show()

        # save the image
        wordcloud.to_file('test.jpg')
    
    # punctuation stripped from both ends of each segmented token
    PUNCTUATION = u'!,。【】“《》:、”?,.?!\'"{}'

    # filler words that appear often but carry little meaning; filtered out by hand
    FILTER_WORDS = {u'初始化', u'源代码', u'文件夹', u'应用程序', u'编译器', u'比如说',
                    u'构造函数', u'对话框', u'为什么', u'接下来', u'也就是'}

    def guolv(data):
        # Clean one segmented token; return '' if it should be dropped.
        data1 = data.strip(PUNCTUATION)

        # manual filtering of the filler words above
        if data1 in FILTER_WORDS:
            return ''

        # drop English words, identifiers and numbers
        pattern = re.compile(r'([a-zA-Z_-]+)|([0-9_-]+)', re.S)
        data1 = re.sub(pattern, '', data1)

        # keep only words of three or more characters
        if len(data1) < 3:
            return ''

        return data1
    
    
    def cipinCalc(text):
        # Segment the text with jieba and count how often each cleaned word appears.
        cutDict = dict()

        # jieba.load_userdict('dict.txt')
        for word in jieba.cut(text):
            # drop punctuation, filler words, ASCII tokens and very short words
            word = guolv(word)
            if word.strip() == '':
                continue

            if word in cutDict:
                cutDict[word] += 1
            else:
                cutDict[word] = 1

        return cutDict
    
    def read_csv(fileName):
        file = pandas.read_csv(fileName, header=None, encoding='utf_8_sig')
        return file.values[:]
    
    if __name__ == '__main__':
    
        # read the merged article text and build the word-frequency dict
        with codecs.open('r.txt', 'r', encoding='gbk') as f:
            data = f.read()

        cutDict = cipinCalc(data)
    
        analyCiYun(cutDict, bg='bg.png')
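
    The hand-written filtering in guolv could also be replaced by a stopword list applied after segmentation. A minimal sketch, assuming a hypothetical stopwords.txt with one word per line:

    import jieba

    def cipin_with_stopwords(text, stopword_file='stopwords.txt'):
        # Count word frequencies, skipping anything listed in the stopword file.
        with open(stopword_file, encoding='utf-8') as f:
            stopwords = {line.strip() for line in f if line.strip()}
        freq = {}
        for word in jieba.cut(text):
            word = word.strip()
            if len(word) < 2 or word in stopwords:
                continue
            freq[word] = freq.get(word, 0) + 1
        return freq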
    
    

    Result

    0_1526449874831_test.jpg

    Analysis

    The larger a word appears, the more often it occurs in the articles.
    You can see that many of the posts touch on coordinate systems, developers, cameras, and cross-platform topics.
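
    To cross-check the picture, the most frequent terms can also be printed straight from the frequency dict built by cipinCalc above; a small sketch:

    from collections import Counter

    # cutDict is the {word: count} dict returned by cipinCalc()
    for word, count in Counter(cutDict).most_common(20):
        print(word, count)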



  • Thumbs up for you! You have really got the hang of all this; crawling websites with Python looks like a lot of fun.

