分析彩阳的csdn博客



  • 今天看到了彩阳的博客(https://blog.csdn.net/gamesdev),他写了很多帖子,我想统计一下内容,看看到底都在说些什么

    用 Python 写爬虫,把每篇帖子的链接、标题和正文存到 csv 文件中

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    #csdn 彩阳 帖子
    # https://blog.csdn.net/gamesdev
    #https://blog.csdn.net/gamesdev/article/details/52684465
    import pandas
    import re
    import requests
    import threading, json, urllib
    from bs4 import BeautifulSoup
    import bs4
    
    def get_one_page(url, data=None, proxy=None, headers=None):
        """Fetch ``url`` with an HTTP GET and return the response body as text.

        Args:
            url: Address to request.
            data: Optional request body, forwarded to ``requests`` as ``data``.
            proxy: Optional proxies mapping, forwarded as ``proxies``.
            headers: Optional mapping of request headers.

        Returns:
            The body decoded as UTF-8 when the server answers with status 200;
            ``None`` for any other status code or when the request fails.
        """
        try:
            # The context manager closes the session and its pooled connections
            # on every exit path.
            with requests.Session() as s:
                # Retries are a property of the transport adapter; assigning
                # ``DEFAULT_RETRIES`` on an existing session changes nothing.
                adapter = requests.adapters.HTTPAdapter(max_retries=5)
                s.mount('http://', adapter)
                s.mount('https://', adapter)

                req = s.get(url, timeout=20, headers=headers, data=data, proxies=proxy)
                if req.status_code != 200:
                    return None

                req.encoding = 'utf-8'
                return req.text
        except requests.RequestException:
            # Network-level failure (timeout, DNS, connection reset, ...):
            # callers treat ``None`` as "page unavailable".
            return None
    
    
    def write_csv(file_name, data, columns):
        """Append ``data`` as rows to the CSV file ``file_name``.

        Args:
            file_name: Path of the CSV file; it is opened in append mode.
            data: Sequence of row dicts accepted by ``pandas.DataFrame``; the
                first row's ``'url'`` value is logged if writing fails.
            columns: Column names to write, in output order.

        Writing is best effort: on failure the URL of the first row is appended
        to ``error.txt`` and a message is printed instead of raising.
        """
        try:
            frame = pandas.DataFrame(data)
            frame.to_csv(file_name, index=False, header=False, mode='a+', encoding="utf_8_sig", columns=columns)
        except Exception:
            # Rows are dicts, so the URL must be read by key; attribute access
            # would raise inside this handler and hide the original failure.
            try:
                failed_url = data[0]['url']
            except (IndexError, KeyError, TypeError):
                failed_url = ''
            with open('error.txt', 'a+', encoding='utf-8') as f:
                f.write('%s\n' % failed_url)
            print('write error gbk')
    
    # Module-level set; nothing in the visible script reads or updates it.
    allSet = set()
    
    def main(page):
        """Crawl one page of the blog's article list and append each article to ``data.csv``.

        Fetches list page number ``page``, collects the link of every entry
        inside the ``article-list`` container, downloads each article and writes
        one ``url``/``title``/``content`` row per article through ``write_csv``.
        Progress and failures are reported with ``print``; nothing is returned.

        Args:
            page: Number of the list page to crawl.
        """
        print('start...')
        url = 'https://blog.csdn.net/gamesdev/article/list/%s?' % str(page)
        # NOTE(review): the Cookie below is a hard-coded browser session value;
        # consider loading it from configuration instead of source.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
            'Cookie': 'uuid_tt_dd=10_7101573470-1521084240690-516780; __yadk_uid=9ZbhXkuRJ7Gw8EwlBeBn8eLfUdqgP39S; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=1788*1*PC_VC; kd_user_id=7774149a-70bb-44c7-9f2b-b8510dafc938; UN=chen_227; BT=1522213721218; UM_distinctid=1626b021cdb1d9-09bc7c571fb7ff-3a614f0b-1fa400-1626b021cdc6d0; bdshare_firstime=1522731959154; CNZZDATA1259587897=424194549-1523582737-https%253A%252F%252Fwww.baidu.com%252F%7C1523582737; smidV2=201805101510249059baa2d89883e782c45d2932b09b2e00fdf043b804a8a00; dc_session_id=10_1526429486171.338580; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1526348349,1526360837,1526362773,1526429439; TY_SESSION_ID=5ed14904-2d62-4f65-a4b3-39ab854135d5; dc_tos=p8sp8v; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1526430501'
        }

        html = get_one_page(url, headers=headers)
        if not html:
            print('html is none')
            return

        listing = BeautifulSoup(html, 'lxml').find('div', {'class': 'article-list'})
        if listing is None:
            # find() returns None when the container is missing; iterating it
            # would raise TypeError.
            print('article list is none')
            return

        # Collect the first link of every child element of the list container.
        urlList = []
        for entry in listing.children:
            if not isinstance(entry, bs4.element.Tag):
                continue  # whitespace and other text nodes carry no link
            link = entry.a
            href = link.get('href') if link is not None else None
            if href:
                urlList.append(href)

        for num, url1 in enumerate(urlList):
            print('handle page=%s, num=%s' % (str(page), str(num)))
            htmlPage = get_one_page(url1, headers=headers)
            if not htmlPage:
                print('htmlPage is none')
                continue

            bs4Obj = BeautifulSoup(htmlPage, 'lxml')
            article = bs4Obj.find('article')
            titleTag = bs4Obj.find('h6', {'class': 'title-article'})
            if article is None or titleTag is None:
                # Skip pages without the expected markup instead of aborting
                # the whole crawl with an AttributeError.
                print('article is none: %s' % url1)
                continue
            title = titleTag.string

            # One line per non-empty direct child of <article>.
            lines = []
            for con in article.children:
                if isinstance(con, bs4.element.PreformattedString):
                    continue  # comments, CDATA, doctype: not article text
                if isinstance(con, bs4.element.Tag):
                    text = con.get_text().strip()
                else:
                    text = str(con).strip()
                if text:
                    lines.append(text + '\n')
            allText = ''.join(lines)

            print(allText)

            columns = ['url', 'title', 'content']
            write_csv('data.csv', [{'url': url1, 'title': title, 'content': allText}], columns=columns)
    
    if __name__ == '__main__':
        # Crawl list pages 1 through 15, one after another.
        page = 1
        while page < 16:
            main(page)
            page += 1
    

    因为 csv 文件存在乱码问题,先把其中的正文一列转存为 GBK 编码的 txt 文件

    # -*- coding: utf-8 -*-
    import codecs
    import re
    
    import chardet
    import pandas
    
    
    def read_csv(fileName):
        """Load a header-less CSV file and return its cells as a 2-D array.

        Args:
            fileName: Path of the CSV file, decoded as ``utf_8_sig``.

        Returns:
            The ``values`` array of the parsed frame, one row per CSV line.
        """
        frame = pandas.read_csv(fileName, header=None, encoding='utf_8_sig')
        rows = frame.values
        return rows[:]
    
    dataList = read_csv('data.csv')


    # Open the output once instead of once per row; 'a+' still appends to an
    # existing r.txt exactly as before.
    with codecs.open('r.txt', 'a+', encoding='gbk') as f:
        for data in dataList:
            da = data[2]  # third column of each row
            print(da)
            try:
                f.write(da + '\n')
            except (TypeError, UnicodeEncodeError):
                # TypeError: the cell is not a string;
                # UnicodeEncodeError: the text has characters GBK cannot encode.
                # Either way the row is reported and skipped.
                print('---', da)
    

    使用结巴分词,统计词频,使用词云生成图片,图片形状采用萌梦图片

    # -*- coding: utf-8 -*-
    
    # 导入wordcloud模块和matplotlib模块
    import codecs
    import re
    
    import chardet
    import jieba
    import pandas
    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
    import matplotlib.pyplot as plt
    from scipy.misc import imread
    from snownlp import SnowNLP
    
    def analyCiYun(hktk, bg=None, imgNmae='test.jpg', font_path='C:\\Windows\\Fonts\\STFANGSO.ttf'):
        """Render a word cloud from word frequencies, display it and save it.

        Args:
            hktk: Word-frequency mapping handed to
                ``WordCloud.generate_from_frequencies``.
            bg: Optional path of an image used as the cloud's mask; ``None``
                renders without a mask.
            imgNmae: Path the rendered image is saved to.
            font_path: Font file used to draw the words.  Defaults to the
                Windows font this function always used; pass another font
                file on systems where that path does not exist.
        """
        # Load the mask image only when one was requested.
        bg_pic = imread(bg) if bg is not None else None

        # No ImageColorGenerator is built: the cloud is never recoloured, so
        # the mask only determines its shape.
        wordcloud = WordCloud(
            mask=bg_pic,
            background_color='white',
            scale=1.5,
            font_path=font_path,
        ).generate_from_frequencies(hktk)

        # Show the word cloud.
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()

        # Save the image.
        wordcloud.to_file(imgNmae)
    
    
    def analyCiYun2(hktk, bg=None):
        """Render a word cloud with ``WordCloud.generate``, display it and save it as ``test.jpg``.

        Args:
            hktk: Input handed to ``WordCloud.generate``.
            bg: Optional path of an image used as the cloud's mask; ``None``
                renders without a mask.
        """
        # Load the mask image only when one was requested.
        mask_image = None if bg is None else imread(bg)

        cloud = WordCloud(mask=mask_image, background_color='white', scale=1.5)
        cloud = cloud.generate(hktk)

        if mask_image is not None:
            # The generator is constructed but its result is not used.
            ImageColorGenerator(mask_image)

        # Show the word cloud.
        plt.imshow(cloud)
        plt.axis('off')
        plt.show()

        # Save the image.
        cloud.to_file('test.jpg')
    
    englishChars = ['a','b']

    # Punctuation trimmed from both ends of a token (ASCII and full-width forms).
    _GUOLV_PUNCTUATION = u'!,.?:\'"{}!,。?:、【】“”《》'

    # Hand-picked words that are excluded from the statistics as whole tokens.
    _GUOLV_STOPWORDS = frozenset([
        u'初始化',
        u'源代码',
        u'文件夹',
        u'应用程序',
        u'编译器',
        u'比如说',
        u'构造函数',
        u'对话框',
        u'为什么',
        u'接下来',
        u'也就是',
    ])

    # Runs of ASCII letters, digits, underscores and hyphens.
    _GUOLV_ASCII_RUN = re.compile(r'[a-zA-Z0-9_-]+')


    def guolv(data):
        """Normalise one segmented token, returning ``''`` when it should be ignored.

        ASCII letters, digits, ``_`` and ``-`` are removed, surrounding
        punctuation is trimmed, and the token is rejected when it is one of
        the hand-picked stop words or shorter than three characters.

        Stop words are compared as whole tokens.  ``str.strip(word)`` treats its
        argument as a set of characters, which would also eat the edges of
        unrelated tokens (for example reducing '程序员' to '员').

        Args:
            data: A single token produced by the word segmenter.

        Returns:
            The cleaned token, or ``''`` if it is filtered out.
        """
        data1 = _GUOLV_ASCII_RUN.sub('', data)
        data1 = data1.strip(_GUOLV_PUNCTUATION)

        # Manual filter.
        if data1 in _GUOLV_STOPWORDS:
            return ''

        if len(data1) < 3:
            return ''

        return data1
    
    
    def cipinCalc(text):
        """Count how often each accepted word occurs in ``text``.

        The text is segmented with ``jieba.cut``; every token is passed through
        ``guolv`` and tokens that come back blank are skipped.

        Args:
            text: The text to segment and count.

        Returns:
            A dict mapping each kept word to its number of occurrences.
        """
        frequencies = {}

        for token in jieba.cut(text):
            # Filter punctuation and unwanted tokens.
            word = guolv(token)
            if not word.strip():
                continue
            frequencies[word] = frequencies.get(word, 0) + 1

        return frequencies
    
    def read_csv(fileName):
        """Parse the header-less CSV at ``fileName`` (``utf_8_sig``) and return its ``values`` array."""
        options = {'header': None, 'encoding': 'utf_8_sig'}
        table = pandas.read_csv(fileName, **options)
        return table.values[:]
    
    if __name__ == '__main__':
        # Start from empty values; data and cutDict are overwritten below.
        data, cutList, cutDict = '', [], dict()

        # Read the GBK-encoded text in one go.
        with codecs.open('r.txt', 'r', encoding='gbk') as f:
            data = f.read()

        # Count word frequencies, then draw the cloud with bg.png as the mask.
        cutDict = cipinCalc(data)
        analyCiYun(cutDict, bg='bg.png')
    
    

    效果

    0_1526449874831_test.jpg

    分析

    文字越大,说明文章中出现词频越高
    可以看到很多文章中涉及坐标系,开发者,摄像机,跨平台的描述



  • 为你点一个赞!这都会融会贯通了,看来做Python的爬网站真的很有意思。


 

走马观花

最近的回复

关注我们

微博
QQ群











召唤伊斯特瓦尔