Python batch download code of Sina blog

Posted 3 years ago, written in Python, viewed 207 times.
URL https://pastebin.vip/view/151de84c
  1. # coding=utf-8
  2. import urllib2
  3. import sys, os
  4. import re
  5. import string
  6. from BeautifulSoup import BeautifulSoup
  7.  
  8. def encode(s):
  9.     return s.decode('utf-8').encode(sys.stdout.encoding, 'ignore')
  10.  
  11. def getHTML(url):
  12.     #proxy_handler = urllib2.ProxyHandler({'http':'http://211.138.124.211:80'})
  13.     #opener = urllib2.build_opener(proxy_handler)
  14.     #urllib2.install_opener(opener)
  15.     req = urllib2.Request(url)
  16.     response = urllib2.urlopen(req, timeout=15)
  17.  
  18.     return BeautifulSoup(response, convertEntities=BeautifulSoup.HTML_ENTITIES)
  19.  
  20. def visible(element):
  21.     '''抓取可见的文本元素'''
  22.     if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
  23.         return False
  24.     elif re.match('<!--.*-->', str(element)):
  25.         return False
  26.     elif element == u'\xa0':
  27.         return False
  28.  
  29.     return True
  30.  
  31. def delReturn(element):
  32.     '''删除元素内的换行'''
  33.     return re.sub('(?<!^)\n+(?!$)', ' ', str(element)).decode('utf-8')
  34.  
  35. def validFilename(filename):
  36.     # windows
  37.     return re.sub('[\/:*?<>"|\xa0]', '', filename)
  38.  
  39. def writeToFile(text, filename, dirname):
  40.     if not os.path.exists(dirname):
  41.         os.makedirs(dirname)
  42.         print encode('保存到目录'), dirname
  43.  
  44.     filename = validFilename(filename)
  45.     print encode('保存文章'), filename
  46.  
  47.     path = os.path.join(dirname, filename)
  48.     if not os.path.exists(path):
  49.         f = open(path, 'w')
  50.         f.write(text)
  51.         f.close()
  52.     else:
  53.         print filename, encode('已经存在')
  54.  
  55. def formatContent(url, title=''):
  56.     '''格式化文章内容'''
  57.     page = getHTML(url)
  58.  
  59.     content = page.find('div', {'class':'articalContent'})
  60.     art_id = re.search('blog_(\w+)\.html', url).group(1)
  61.     blog_name = page.find('span', id='blognamespan').string
  62.  
  63.     if title == '':
  64.         title = page.find('h2', id=re.compile('^t_')).string
  65.  
  66.     temp_data = filter(visible, content.findAll(text=True)) # 去掉不可见元素
  67.     temp_data = ''.join(map(delReturn, temp_data)) # 删除元素内的换行符
  68.     temp_data = temp_data.strip() # 删除文章首尾的空行
  69.     temp_data = re.sub('\n{2,}', '\n\n', temp_data) # 删除文章内过多的空行
  70.  
  71.     # 输出到文件
  72.     # 编码问题
  73.     temp_data = '本文地址:'.decode('utf-8') + url + '\n\n' + temp_data
  74.     op_text = temp_data.encode('utf-8')
  75.     op_file = title + '_' + art_id +'.txt'
  76.  
  77.     writeToFile(op_text, op_file, blog_name)
  78.  
  79. def articlelist(url):
  80.     articles = {}
  81.  
  82.     page = getHTML(url)
  83.     pages = page.find('ul', {'class':'SG_pages'}).span.string
  84.     page_num = int(re.search('(\d+)', pages).group(1))
  85.  
  86.     for i in range(1, page_num+1):
  87.         print encode('生成第%d页文章索引'%i)
  88.         if i != 1:
  89.             url = re.sub('(_)\d+(\.html)$', '\g<1>'+str(i)+'\g<2>', url)
  90.             page = getHTML(url)
  91.  
  92.         article = page.findAll('span', {'class':'atc_title'})
  93.  
  94.         for art in article:
  95.             art_title = art.a['title']
  96.             art_href = art.a['href']
  97.             articles[art_title] = art_href
  98.  
  99.     return articles
  100.  
  101. def blog_dld(articles):
  102.     if not isinstance(articles, dict):
  103.         return False
  104.  
  105.     print encode('开始下载文章')
  106.     for art_title, art_href in articles.items():
  107.         formatContent(art_href, art_title)
  108.  
  109. if __name__ == '__main__':
  110.     sel = raw_input(encode('你要下载的是(1)全部文章还是(2)单篇文章,输入1或者2: '))
  111.  
  112.     if sel == '1':
  113.         #articlelist_url = 'http://blog.sina.com.cn/s/articlelist_1303481411_0_1.html'
  114.         articlelist_url = raw_input(encode('请输入博客文章目录链接: '))
  115.         articles = articlelist(articlelist_url)
  116.         blog_dld(articles)
  117.     else:
  118.         #article_url = 'http://blog.sina.com.cn/s/blog_4db18c430100gxc5.html'
  119.         article_url = raw_input(encode('请输入博客文章链接: '))
  120.         formatContent(article_url)
  121. #//python/5319

Reply to "Python batch download code of Sina blog"

Here you can reply to the paste above

captcha

https://burned.cc - Burn After Reading Website