本文来自网易云社区
作者:王贝
现在连小学生都在学 Python 了,作为专业程序员当然不能落下。所以我快马加鞭,周末在家学起了 Python 3。Python 3 的基本语法比较简单,相比 Java 开发更加敏捷。基础语法这里就不讲了,主要讲一下我这个爬虫小程序的实现逻辑。
# GET request: the response object is bound to `rsp` for the duration of the
# with-block and closed when the block exits.
with requests.get(url, params={}, headers={}) as rsp:
    rsp.text  # response body decoded as text
# POST request sending a JSON body.
with requests.post(url, json={}, headers={}) as rsp:
    rsp.text  # response body decoded as text
# Parse the page, then collect the text of every <a class="mnav"> found
# under <div id="u1">.
soup = BeautifulSoup(html, 'html.parser')
atags = soup.find('div', {'id': 'u1'}).findChildren('a', {'class': 'mnav'})
values = [atag.text for atag in atags]
<html>
<head>
<meta http-equiv=content-type content=text/html;charset=utf-8>
<meta http-equiv=X-UA-Compatible content=IE=Edge>
<meta content=always name=referrer>
<link rel=stylesheet type=text/css href=http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css>
<title>百度一下,你就知道</title></head>
<body link=#0000cc>
<div id=wrapper>
<div id=head>
<div > <div > <div > <div id=lg><img hidefocus=true src=//www.baidu.com/img/bd_logo1.png width=270 height=129></div>
<form id=form name=f action=//www.baidu.com/s > <input type=hidden name=ie value=utf-8> <input type=hidden name=f value=8> <input type=hidden
name=rsv_bp
value=1>
<input type=hidden name=rsv_idx value=1> <input type=hidden name=tn value=baidu><span
><input id=kw name=wd > autocomplete=off autofocus></span><span
><input type=submit id=su value=百度一下 ></span></form>
</div>
</div>
<div id=u1><a href=http://news.baidu.com name=tj_trnews class=mnav>新闻</a> <a href=http://www.hao123.com name=tj_trhao123 class=mnav>hao123</a> <a href=http://map.baidu.com name=tj_trmap class=mnav>地图</a> <a href=http://v.baidu.com name=tj_trvideo class=mnav>视频</a> <a href=http://tieba.baidu.com name=tj_trtieba class=mnav>贴吧</a> <noscript><a
href=http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1
name=tj_login class=lb>登录</a></noscript> <script>document.write('<a href="http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u=' + encodeURIComponent(window.location.href + (window.location.search === "" ? "?" : "&") + "bdorz_come=1") + '" name="tj_login" class="lb">登录</a>');</script>
<a href=//www.baidu.com/more/ name=tj_briicon class=bri>更多产品</a> </div>
</div>
<div id=ftCon>
<div id=ftConw><p id=lh><a href=http://home.baidu.com>关于百度</a> <a href=http://ir.baidu.com>About Baidu</a></p>
<p id=cp>©2017 Baidu <a href=http://www.baidu.com/duty/>使用百度前必读</a> <a
href=http://jianyi.baidu.com/ > src=//www.baidu.com/img/gs.gif></p></div>
</div>
</div>
</body>
</html>
# Table of Youku banner records; the time of each record is stored as separate
# integer columns (year, month, date, hour, minute).
CREATE TABLE `youku_banner` (
`id` bigint(22) NOT NULL AUTO_INCREMENT,
`type` int(2) NOT NULL, # Youku banner type: 1 = TV, 2 = movie, 3 = variety show
`year` int(4) NOT NULL,
`month` int(2) NOT NULL,
`date` int(2) NOT NULL,
`hour` int(2) NOT NULL,
`minute` int(2) NOT NULL,
`img` varchar(255) DEFAULT NULL,
`title` varchar(255) DEFAULT NULL,
`url` varchar(255) DEFAULT NULL,
# Defaults to the insert time and is refreshed whenever the row is updated.
`create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY (`id`),
# NOTE(review): despite its name, idx_uniq is declared with KEY, i.e. a plain
# (non-unique) index.
KEY `idx_uniq` (`year`,`month`,`date`,`hour`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=83 DEFAULT CHARSET=utf8mb4
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
# NOTE(review): credentials are hard-coded; consider loading them from
# configuration or the environment.
# charset=utf8mb4 matches the youku_banner table's DEFAULT CHARSET so that
# 4-byte characters (e.g. emoji in titles) survive the connection; MySQL's
# plain "utf8" is limited to 3-byte characters.
dburl = 'mysql+pymysql://root:123@localhost/youku?charset=utf8mb4'
# Pool of up to 100 connections; connections older than 3600 s are recycled.
ds = create_engine(dburl, pool_size=100, pool_recycle=3600)
# Factory for sessions bound to the engine above.
Session = sessionmaker(bind=ds)
# Session manager class: exposes a session through the `with` statement.
class SessionManager:
    """Context manager that creates one ``Session`` per instance."""

    def __init__(self):
        # A new session is created as soon as the manager is constructed.
        self.session = Session()

    def __enter__(self):
        """Hand the session created in ``__init__`` to the ``with`` block."""
        return self.session

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Leave the session open and print a marker.

        Nothing is returned, so an exception raised inside the ``with`` block
        is not suppressed.
        """
        # The author relies on the connection pool to manage the session and
        # deliberately does not call session.close() here.
        print('not close')
youkubannerdao.py
import logging

from sqlalchemy import Sequence, Column, Integer, BigInteger, String, TIMESTAMP, text
from sqlalchemy.ext.declarative import declarative_base

from youku_any.datasource import SessionManager
Base = declarative_base()


class YoukuBanner(Base):
    """ORM mapping for the ``youku_banner`` table plus simple CRUD helpers.

    Every helper obtains its own session through ``SessionManager``.  The
    write helpers (``add``, ``addBatch``, ``remove``, ``update``) commit on
    success; on failure they roll back, log the error and return normally,
    so callers never see a database exception from them.
    """

    # Name of the mapped table.
    __tablename__ = 'youku_banner'

    # Column mappings.
    id = Column(BigInteger, Sequence('id'), primary_key=True)
    type = Column(Integer)
    year = Column(Integer)
    month = Column(Integer)
    date = Column(Integer)
    hour = Column(Integer)
    minute = Column(Integer)
    img = Column(String(255))
    title = Column(String(255))
    url = Column(String(255))
    # Python attribute ``createTime`` maps to the ``create_time`` column.
    createTime = Column('create_time', TIMESTAMP)

    def _run_in_transaction(self, work):
        """Call ``work(session)`` and commit; roll back and log on failure.

        Only ``Exception`` is caught, so KeyboardInterrupt and SystemExit are
        no longer swallowed the way a bare ``except:`` would swallow them.
        """
        with SessionManager() as session:
            try:
                work(session)
                session.commit()
            except Exception:
                session.rollback()
                logging.getLogger(__name__).exception(
                    'youku_banner write failed; transaction rolled back')

    def add(self):
        """Insert this instance as a new row."""
        self._run_in_transaction(lambda session: session.add(self))

    def addBatch(self, values):
        """Insert every instance in *values* within one transaction."""
        self._run_in_transaction(lambda session: session.add_all(values))

    def select(self, param):
        """Return a query over ``YoukuBanner`` filtered by *param*.

        The query object itself is returned (no ``.all()`` is called here);
        the caller iterates it to obtain the rows.
        """
        with SessionManager() as session:
            return session.query(YoukuBanner).select_from(YoukuBanner).filter(param)

    def remove(self, parma):
        """Delete the rows matching the filter criterion *parma*.

        The parameter name (sic) is kept because callers pass it by keyword.
        """
        self._run_in_transaction(
            lambda session: session.query(YoukuBanner)
            .filter(parma)
            .delete(synchronize_session='fetch'))

    def update(self, param, values):
        """Update the rows matching *param* with the mapping *values*.

        *values* is handed to ``Query.update`` unchanged.
        """
        self._run_in_transaction(
            lambda session: session.query(YoukuBanner)
            .filter(param)
            .update(values, synchronize_session='fetch'))
import requests
import json
import re
import datetime
from bs4 import BeautifulSoup
from sqlalchemy import text
from youku_any.youkubannerdao import YoukuBanner
def getsoup(url):
    """Fetch *url* and return its HTML parsed into a ``BeautifulSoup`` tree.

    The response encoding reported by ``requests`` is used when it is UTF-8
    (compared case-insensitively).  Otherwise the charset declared inside the
    page is preferred, falling back to ``req.apparent_encoding``.  The body is
    decoded once and handed to the parser as text.

    Args:
        url: address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.
    """
    with requests.get(url, params=None, headers=None) as req:
        encoding = req.encoding
        if encoding is None or encoding.lower() != 'utf-8':
            declared = requests.utils.get_encodings_from_content(req.text)
            encoding = declared[0] if declared else req.apparent_encoding
        try:
            html = req.content.decode(encoding or 'utf-8', errors='replace')
        except LookupError:
            # The page declared a charset name Python does not know.
            html = req.content.decode('utf-8', errors='replace')
    return BeautifulSoup(html, 'html.parser')
def getbanner(soup):
    """Extract the banner list embedded in the page and store every entry.

    The banners are read from a JSON array literal inside the second
    ``<script type="text/javascript">`` under the div with id ``m_86804``
    and name ``m_pos``.  Each entry's ``img``, ``title`` and ``url`` are
    saved as a ``YoukuBanner`` of type 1, stamped with the current local time.

    Args:
        soup: parsed page; must offer ``find`` as ``BeautifulSoup`` does.

    Raises:
        ValueError: if the container div, its second script tag or the JSON
            array cannot be found (``json.JSONDecodeError`` from malformed
            JSON is a ``ValueError`` subclass as well).
    """
    container = soup.find('div', {'id': 'm_86804', 'name': 'm_pos'})
    if container is None:
        raise ValueError('banner container div (id=m_86804) not found')

    scripts = container.findChildren('script', {'type': 'text/javascript'})
    if len(scripts) < 2:
        raise ValueError('banner script tag not found in container div')

    # Greedy match from the first "[" to the last "]" on a line.
    match = re.search(r'\[.*\]', scripts[1].text)
    if match is None:
        raise ValueError('no JSON array found in banner script')

    banners = json.loads(match.group())
    # One timestamp for the whole batch so all banners of a crawl agree.
    now = datetime.datetime.now()
    for banner in banners:
        YoukuBanner(
            type=1,
            year=now.year,
            month=now.month,
            date=now.day,
            hour=now.hour,
            minute=now.minute,
            img=banner['img'],
            title=banner['title'],
            url=banner['url'],
        ).add()
# Script entry: download the Youku TV page and store the banners found in it.
soup=getsoup('http://tv.youku.com/')
getbanner(soup)
# Exercise the DAO helpers on an otherwise empty YoukuBanner instance.
youkuBanner = YoukuBanner()
# Delete the rows with id 67 and 71.
youkuBanner.remove(parma=text('id=67 or id=71'))
# Append '呼啸山庄' to the title of row 70 via a SQL expression on the column.
youkuBanner.update(param=text('id=70'),values={'title':YoukuBanner.title + '呼啸山庄'})
# Repeat the update and a select 10000 times.
# NOTE(review): indentation was lost in this listing, so it is unclear whether
# the final `for banner in bannerList` loop runs inside or after this loop.
# NOTE(review): each pass appends to a varchar(255) title — presumably a
# stress test; confirm the intended behavior once the column is full.
for i in range(0,10000):
youkuBanner.update(param=text('id=70'), values={'title': YoukuBanner.title + '呼啸山庄'})
# The filter text also carries the ORDER BY and LIMIT clauses.
bannerList = youkuBanner.select(param=text('id > 66 and id < 77 order by id asc limit 0,7'))
print("lines--------%d" % i)
# time.sleep(10)
# Print selected fields of each returned banner.
for banner in bannerList:
print(banner.id,banner.minute,banner.img,banner.title)
到此,一个简单的爬虫脚本就写完了。周末两天的成果还是让人有点小满足,不过这只是 Python 的冰山一角,还有好多内容等着我们去探讨呢。
网易云免费体验馆,0成本体验20+款云产品!
更多网易研发、产品、运营经验分享请访问网易云社区。