# Grab the rendered markup as the outerHTML of the first element matched by
# "//*" (the document root). The original author advises against
# driver.page_source because the source it returns is "non-standard".
# find_element("xpath", ...) is used instead of find_element_by_xpath, which
# was removed in Selenium 4.3; this form also works on older releases.
html = driver.find_element("xpath", "//*").get_attribute("outerHTML")
3.爬取网页中所有URL链接
# coding=utf-8
# Crawl every URL link on a web page: print each <a ...>...</a> element,
# then each bare href target.
import re
import urllib

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    urlopen = urllib.urlopen  # Python 2, which the original snippet targeted

# Whole anchor elements; case-insensitive so <A HREF=...> is matched too.
ANCHOR_RE = re.compile(r"<a.*?href=.*?</a>", re.I)
# href values in double or single quotes. The negated character classes stop
# at the closing quote, so an empty href="" is skipped instead of swallowing
# the markup that follows it.
HREF_RE = re.compile(r"""href=(?:"([^"]+)"|'([^']+)')""")


def fetch_html(page_url, encoding="utf-8"):
    """Download *page_url* and return its body decoded as text.

    Undecodable bytes are replaced rather than raising, so one bad byte does
    not abort the crawl. The response is always closed.
    """
    response = urlopen(page_url)
    try:
        raw = response.read()
    finally:
        response.close()
    return raw.decode(encoding, "replace")


def find_anchor_tags(html):
    """Return every ``<a ... href=...>...</a>`` element found in *html*."""
    return ANCHOR_RE.findall(html)


def find_link_targets(html):
    """Return the non-empty quoted ``href`` values found in *html*."""
    # Exactly one of the two groups is non-empty for each match.
    return [double or single for double, single in HREF_RE.findall(html)]


def print_page_links(page_url="http://www.csdn.net/"):
    """Print the anchor elements of *page_url*, then its link targets."""
    html = fetch_html(page_url)
    for anchor in find_anchor_tags(html):
        print(anchor)
    for target in find_link_targets(html):
        print(target)


if __name__ == "__main__":
    print_page_links()
4.抓取title标签间的内容
# Extract the text between the <title> tags of a page, using two equivalent
# regex techniques: a capture group and lookaround assertions.
import re
import urllib

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    urlopen = urllib.urlopen  # Python 2, which the original snippet targeted

# re.S lets "." span newlines so a title split across lines is still found;
# re.I accepts <TITLE> as well, since HTML tag names are case-insensitive.
TITLE_GROUP_RE = re.compile(r"<title>(.*?)</title>", re.I | re.S)
TITLE_LOOKAROUND_RE = re.compile(r"(?<=<title>).*?(?=</title>)", re.I | re.S)


def title_via_findall(html):
    """Return the first title in *html* using a capture group, or None."""
    titles = TITLE_GROUP_RE.findall(html)
    return titles[0] if titles else None


def title_via_lookaround(html):
    """Return the first title in *html* using lookbehind/lookahead, or None."""
    match = TITLE_LOOKAROUND_RE.search(html)
    return match.group() if match else None


def print_page_title(page_url="http://www.baidu.com/", encoding="utf-8"):
    """Download *page_url* and print its title once per technique."""
    response = urlopen(page_url)
    try:
        raw = response.read()
    finally:
        response.close()
    # Decode once at the I/O boundary so the str patterns apply on Python 3.
    html = raw.decode(encoding, "replace")
    for title in (title_via_findall(html), title_via_lookaround(html)):
        if title is None:
            print("no <title> element found")
        else:
            print(title)


if __name__ == "__main__":
    print_page_title()