Python script that extracts the list of URLs found within a web page

Posted by an anonymous author 4 years ago; written in Python; viewed 54 times.
URL https://pastebin.vip/view/cf5530d9
  1. from bs4 import BeautifulSoup
  2. import time,re,urllib2
  3.  
  4. t=time.time()
  5.  
  6. websiteurls={}
  7.  
  8. def scanpage(url):
  9.    
  10.     websiteurl=url
  11.     t=time.time()
  12.     n=0
  13.     html=urllib2.urlopen(websiteurl).read()
  14.     soup=BeautifulSoup(html)
  15.     pageurls=[]
  16.     Upageurls={}
  17.     pageurls=soup.find_all("a",href=True)
  18.  
  19.     for links in pageurls:
  20.         if websiteurl in links.get("href") and links.get("href") not in Upageurls and links.get("href") not in websiteurls:
  21.             Upageurls[links.get("href")]=0
  22.     for links in Upageurls.keys():
  23.         try:
  24.             urllib2.urlopen(links).getcode()
  25.         except:
  26.             print "connect failed"
  27.         else:
  28.             t2=time.time()
  29.             Upageurls[links]=urllib2.urlopen(links).getcode()
  30.             print n,
  31.             print links,
  32.             print Upageurls[links]
  33.             t1=time.time()
  34.             print t1-t2
  35.         n+=1
  36.     print ("total is "+repr(n)+" links")
  37.     print time.time()-t
  38.  
  39.  
  40. scanpage("http://news.163.com/")
  41.  
  42. #//python/5773

Reply to "Python extracts a list of URLs within a page"

Here you can reply to the paste above

captcha

https://burned.cc - Burn After Reading Website