携程航班信息爬取(python)—第一次写博客,不好请别见外!
携程航班信息爬取
第一次写博客,不好请别见外!
1.航班信息接口
航班信息接口
api=“https://flights.ctrip.com/itinerary/api/12808/products”,这个接口中包含了所要查询的航班信息。
*这是接口中所展现的内容,包含所查询到的航班信息,在routeList中:
1.获取城市的英文缩写
获取城市的英文缩写
因为在POST提交的时候,需要在请求体(即后面代码中的data_frame字典)中加入城市的英文缩写,如:
所以我们必须得到这些城市的字母缩写,根据接口api=“https://flights.ctrip.com/itinerary/api/poi/get”,其中有城市的缩写信息,如下:
所以请求这个接口就可以得到我们想要的信息了:通过正则表达式把需要的信息提取出来,做成一个“城市名→英文缩写”的字典,需要的时候用dict.get()方法就可以取出对应城市的缩写。
但是这个接口返回的数据中缺少某几个城市,所以这些城市只能手动添加。
首先请求api:https://flights.ctrip.com/itinerary/api/poi/get
def request(timeout=10):
    """Fetch the Ctrip POI (city list) endpoint and return the raw body.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The response body as text; callers decode it with ``json.loads``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    # Fixed API endpoint
    url = "https://flights.ctrip.com/itinerary/api/poi/get"
    # Request headers; adjust them to match your own browser if needed
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0",
        "Referer": "https://flights.ctrip.com/itinerary",
        "Content-Type": "application/json",
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to json.loads
    response.raise_for_status()
    return response.text
获得城市字典的代码如下:
def get_city_and_index(content):
    """Build a mapping from city display name to its parenthesised code.

    Args:
        content: Decoded JSON from the POI endpoint. ``content['data']`` must
            contain the groups 'ABCDEF', 'GHIJ', 'KLMN', 'PQRSTUVW' and
            'XYZ'; each group maps a key to a list of entries that carry
            'display' and 'data' string fields.

    Returns:
        A dict mapping each entry's ``display`` value to the text found
        between the first '(' and the last ')' of its ``data`` field, with
        any inner parentheses removed. Entries whose ``data`` has no
        parenthesised part are skipped.

    Raises:
        KeyError: If one of the expected groups or entry fields is missing.
    """
    # Raw string: "\(" in a plain literal is an invalid escape sequence
    code_pattern = re.compile(r'\((.*)\)')
    city_and_index = {}
    for group_name in ('ABCDEF', 'GHIJ', 'KLMN', 'PQRSTUVW', 'XYZ'):
        for entries in content['data'][group_name].values():
            for entry in entries:
                match = code_pattern.search(entry['data'])
                if match is None:
                    # No "(CODE)" part: skip instead of crashing on .group()
                    continue
                # Drop any nested parentheses left inside the greedy match
                code = match.group(1).replace('(', '').replace(')', '')
                city_and_index[entry['display']] = code
    return city_and_index
逻辑为下:
# Download the POI payload as text
response = request()
# Decode the JSON text into Python objects
content = json.loads(response)
# Build the lookup of city name -> English city abbreviation
city_and_index = get_city_and_index(content)
2.POST提交
得到需要的城市字典后,我们就可以进行POST提交了,
search()函数的参数city是所得到的城市字典,dcity是出发城市,acity是到达城市,date为出行日期。search()函数还会调用mean()函数和getInfo()函数:mean()函数负责打印表头,getInfo()函数则从json
类型的legs中取出并打印具体的航班信息:
def mean():
    """Print the table header for the flight listing.

    Writes a rule of 100 dashes, one line of tab-terminated column titles,
    and a second rule to standard output.
    """
    columns = (
        "航班号", "航空公司", "飞机型号", "类型", "起飞地点",
        "降落地点", "起飞时间", "到达时间", "精准率",
    )
    rule = "-" * 100
    print(rule)
    # Every title is followed by a tab, including the last one
    print("".join(title + "\t" for title in columns))
    print(rule)
def getInfo(leg):
    """Print one tab-separated row describing the flight in *leg*.

    Args:
        leg: One element of a route's ``legs`` list. It must contain a
            ``flight`` dict holding the keys read below.

    Raises:
        KeyError: If an expected key is absent from the flight record.
    """
    flight = leg['flight']
    fields = (
        flight['flightNumber'],                         # flight number
        flight['airlineName'],                          # airline
        flight['craftTypeName'],                        # aircraft model
        flight['craftTypeKindDisplayName'],             # aircraft type (display name)
        flight['departureAirportInfo']['airportName'],  # departure airport
        flight['arrivalAirportInfo']['airportName'],    # arrival airport
        flight['departureDate'],                        # departure time
        flight['arrivalDate'],                          # arrival time
        flight['punctualityRate'],                      # punctuality rate
    )
    # Concatenating with "+" raises TypeError on a null or non-string field;
    # render None as empty text and stringify everything else.
    print("\t".join("" if value is None else str(value) for value in fields))
完整的search()函数:
def search(city, dcity, acity, date, timeout=10):
    """Query Ctrip for one-way flights and print the direct ones as a table.

    Args:
        city: Dict mapping city names to city codes (see
            ``get_city_and_index``). Names not in the dict are sent with a
            ``None`` code.
        dcity: Departure city name.
        acity: Arrival city name.
        date: Travel date string, e.g. ``2020-03-02``.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        KeyError: If a route lacks the 'routeType' or 'legs' key.
    """
    url = 'https://flights.ctrip.com/itinerary/api/12808/products'
    # NOTE(review): the cookie and the token below are values captured from
    # one browser session; presumably they expire and must be refreshed.
    # The published listing wrapped this literal across lines; the pieces
    # were rejoined here on the assumption that wraps fell at "; " or "-".
    cookie = "; ".join((
        '_abtest_userid=9a22401e-030f-4503-b20d-6696205a68ed',
        '_RSG=cP4H2lOeBh4Q5LSFnqjOWB',
        '_RDG=28258b145e36812c43286457fb56969f7d',
        '_RGUID=4acba18b-c244-4666-b818-b74c4f71515a',
        '_ga=GA1.2.67711177.1574923739',
        'MKT_CKID=1582371932248.xh3tu.f3q7',
        'MKT_Pagesource=PC',
        'DomesticUserHostCity=BZX|%b0%cd%d6%d0',
        '_gid=GA1.2.861767137.1582695241',
        '_RF1=171.208.25.215',
        'FlightIntl=Search=[%22BZX|%E5%B7%B4%E4%B8%AD(BZX)|3966|BZX|480%22%2C%22TYO|%E4%B8%9C%E4%BA%AC(TYO)|228|TYO|540%22%2C%222020-02-27%22%2C%222020-03-01%22]',
        'Session=smartlinkcode=U130026&smartlinklanguage=zh&SmartLinkKeyWord=&SmartLinkQuary=&SmartLinkHost=',
        'Union=OUID=index&AllianceID=4897&SID=155952&SourceID=&createtime=1582800701&Expires=1583405501197',
        'gad_city=db071d1cf7dabf738c20a3f37592a919',
        'MKT_CKID_LMT=1582800704719',
        'appFloatCnt=26',
        'FD_SearchHistorty={"type":"D","data":"D%24%u5317%u4EAC%28BJS%29%24BJS%242020-03-02%24%u4E0A%u6D77%28SHA%29%24SHA%242020-03-03"}',
        '_bfa=1.1574923736071.3zbuu9.1.1582722830548.1582800698761.14.167',
        '_bfs=1.3',
        '_jzqco=%7C%7C%7C%7C1582800704982%7C1.949239734.1574923738750.1582800704716.1582800712660.1582800704716.1582800712660.undefined.0.0.105.105',
        '__zpspc=9.17.1582800704.1582800712.2%232%7Cwww.baidu.com%7C%7C%7C%7C%23',
        '_bfi=p1%3D10320673304%26p2%3D101023%26v1%3D167%26v2%3D166',
    ))
    headers = {
        # NOTE(review): this value is two browser UA strings pasted together
        # ("...Safari/537.36Mozilla/5.0..."); kept exactly as published.
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        ),
        "Content-Type": "application/json",  # the body is sent as JSON
        "referer": "https://flights.ctrip.com/itinerary/roundtrip/bjs-sha?date=2020-03-02,2020-03-03",
        "cookie": cookie,
    }
    data_frame = {
        "flightWay": "Oneway",
        "classType": "ALL",
        "hasChild": False,
        "hasBaby": False,
        "searchIndex": 1,
        "airportParams": [
            {
                "dcity": city.get(dcity),
                "acity": city.get(acity),
                "dcityname": dcity,
                "acityname": acity,
                "date": date,
            },
        ],
        'token': "db3430b9644d192713fe428890dff5b2",
    }
    response = requests.post(
        url, data=json.dumps(data_frame), headers=headers, timeout=timeout
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page
    response.raise_for_status()
    payload = json.loads(response.text)
    # 'data' may be absent or null; a chained .get() would raise
    # AttributeError on None in that case.
    route_list = (payload.get('data') or {}).get('routeList')

    # Only direct routes are kept. Per the author's note in the full
    # listing: 'Flight' is direct, 'Flighttrain' is a train + flight
    # transfer, 'Train' is another transfer type; extend here to support them.
    direct_legs = []
    for route in route_list or []:
        if route['routeType'] == 'Flight':
            direct_legs.extend(route['legs'])

    if not direct_legs:
        # Covers both "no routeList at all" and "only transfer routes", so
        # an empty table is never printed.
        print(dcity + "----->" + acity + "无直达")
        return

    mean()
    for leg in direct_legs:
        getInfo(leg)
完整代码如下:
完整代码如下:
import requests
import re
import json
def request(timeout=10):
    """Fetch the Ctrip POI (city list) endpoint and return the raw body.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The response body as text; callers decode it with ``json.loads``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    # Fixed API endpoint
    url = "https://flights.ctrip.com/itinerary/api/poi/get"
    # Request headers; adjust them to match your own browser if needed
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0",
        "Referer": "https://flights.ctrip.com/itinerary",
        "Content-Type": "application/json",
    }
    # Send the GET request
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to json.loads
    response.raise_for_status()
    return response.text
def get_city_and_index(content):
    """Build a mapping from city display name to its parenthesised code.

    Args:
        content: Decoded JSON from the POI endpoint. ``content['data']`` must
            contain the groups 'ABCDEF', 'GHIJ', 'KLMN', 'PQRSTUVW' and
            'XYZ'; each group maps a key to a list of entries that carry
            'display' and 'data' string fields.

    Returns:
        A dict mapping each entry's ``display`` value to the text found
        between the first '(' and the last ')' of its ``data`` field, with
        any inner parentheses removed. Entries whose ``data`` has no
        parenthesised part are skipped.

    Raises:
        KeyError: If one of the expected groups or entry fields is missing.
    """
    # Raw string: "\(" in a plain literal is an invalid escape sequence
    code_pattern = re.compile(r'\((.*)\)')
    city_and_index = {}
    for group_name in ('ABCDEF', 'GHIJ', 'KLMN', 'PQRSTUVW', 'XYZ'):
        for entries in content['data'][group_name].values():
            for entry in entries:
                match = code_pattern.search(entry['data'])
                if match is None:
                    # No "(CODE)" part: skip instead of crashing on .group()
                    continue
                # Drop any nested parentheses left inside the greedy match
                code = match.group(1).replace('(', '').replace(')', '')
                city_and_index[entry['display']] = code
    return city_and_index
def getInfo(leg):
    """Print one tab-separated row describing the flight in *leg*.

    Args:
        leg: One element of a route's ``legs`` list. It must contain a
            ``flight`` dict holding the keys read below.

    Raises:
        KeyError: If an expected key is absent from the flight record.
    """
    flight = leg['flight']
    fields = (
        flight['flightNumber'],                         # flight number
        flight['airlineName'],                          # airline
        flight['craftTypeName'],                        # aircraft model
        flight['craftTypeKindDisplayName'],             # aircraft type (display name)
        flight['departureAirportInfo']['airportName'],  # departure airport
        flight['arrivalAirportInfo']['airportName'],    # arrival airport
        flight['departureDate'],                        # departure time
        flight['arrivalDate'],                          # arrival time
        flight['punctualityRate'],                      # punctuality rate
    )
    # Concatenating with "+" raises TypeError on a null or non-string field;
    # render None as empty text and stringify everything else.
    print("\t".join("" if value is None else str(value) for value in fields))
def mean():
    """Print the table header for the flight listing.

    Writes a rule of 100 dashes, one line of tab-terminated column titles,
    and a second rule to standard output.
    """
    columns = (
        "航班号", "航空公司", "飞机型号", "类型", "起飞地点",
        "降落地点", "起飞时间", "到达时间", "精准率",
    )
    rule = "-" * 100
    print(rule)
    # Every title is followed by a tab, including the last one
    print("".join(title + "\t" for title in columns))
    print(rule)
def search(city, dcity, acity, date, timeout=10):
    """Query Ctrip for one-way flights and print the direct ones as a table.

    Args:
        city: Dict mapping city names to city codes (see
            ``get_city_and_index``). Names not in the dict are sent with a
            ``None`` code.
        dcity: Departure city name.
        acity: Arrival city name.
        date: Travel date string, e.g. ``2020-03-02``.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        KeyError: If a route lacks the 'routeType' or 'legs' key.
    """
    url = 'https://flights.ctrip.com/itinerary/api/12808/products'
    # NOTE(review): the cookie and the token below are values captured from
    # one browser session; presumably they expire and must be refreshed.
    # The published listing wrapped this literal across lines; the pieces
    # were rejoined here on the assumption that wraps fell at "; " or "-".
    cookie = "; ".join((
        '_abtest_userid=9a22401e-030f-4503-b20d-6696205a68ed',
        '_RSG=cP4H2lOeBh4Q5LSFnqjOWB',
        '_RDG=28258b145e36812c43286457fb56969f7d',
        '_RGUID=4acba18b-c244-4666-b818-b74c4f71515a',
        '_ga=GA1.2.67711177.1574923739',
        'MKT_CKID=1582371932248.xh3tu.f3q7',
        'MKT_Pagesource=PC',
        'DomesticUserHostCity=BZX|%b0%cd%d6%d0',
        '_gid=GA1.2.861767137.1582695241',
        '_RF1=171.208.25.215',
        'FlightIntl=Search=[%22BZX|%E5%B7%B4%E4%B8%AD(BZX)|3966|BZX|480%22%2C%22TYO|%E4%B8%9C%E4%BA%AC(TYO)|228|TYO|540%22%2C%222020-02-27%22%2C%222020-03-01%22]',
        'Session=smartlinkcode=U130026&smartlinklanguage=zh&SmartLinkKeyWord=&SmartLinkQuary=&SmartLinkHost=',
        'Union=OUID=index&AllianceID=4897&SID=155952&SourceID=&createtime=1582800701&Expires=1583405501197',
        'gad_city=db071d1cf7dabf738c20a3f37592a919',
        'MKT_CKID_LMT=1582800704719',
        'appFloatCnt=26',
        'FD_SearchHistorty={"type":"D","data":"D%24%u5317%u4EAC%28BJS%29%24BJS%242020-03-02%24%u4E0A%u6D77%28SHA%29%24SHA%242020-03-03"}',
        '_bfa=1.1574923736071.3zbuu9.1.1582722830548.1582800698761.14.167',
        '_bfs=1.3',
        '_jzqco=%7C%7C%7C%7C1582800704982%7C1.949239734.1574923738750.1582800704716.1582800712660.1582800704716.1582800712660.undefined.0.0.105.105',
        '__zpspc=9.17.1582800704.1582800712.2%232%7Cwww.baidu.com%7C%7C%7C%7C%23',
        '_bfi=p1%3D10320673304%26p2%3D101023%26v1%3D167%26v2%3D166',
    ))
    headers = {
        # NOTE(review): this value is two browser UA strings pasted together
        # ("...Safari/537.36Mozilla/5.0..."); kept exactly as published.
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        ),
        "Content-Type": "application/json",  # the body is sent as JSON
        "referer": "https://flights.ctrip.com/itinerary/roundtrip/bjs-sha?date=2020-03-02,2020-03-03",
        "cookie": cookie,
    }
    data_frame = {
        "flightWay": "Oneway",
        "classType": "ALL",
        "hasChild": False,
        "hasBaby": False,
        "searchIndex": 1,
        "airportParams": [
            {
                "dcity": city.get(dcity),
                "acity": city.get(acity),
                "dcityname": dcity,
                "acityname": acity,
                "date": date,
            },
        ],
        'token': "db3430b9644d192713fe428890dff5b2",
    }
    response = requests.post(
        url, data=json.dumps(data_frame), headers=headers, timeout=timeout
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page
    response.raise_for_status()
    payload = json.loads(response.text)
    # 'data' may be absent or null; a chained .get() would raise
    # AttributeError on None in that case.
    route_list = (payload.get('data') or {}).get('routeList')

    # Only direct routes are kept. Per the original author's note: 'Flight'
    # is direct, 'Flighttrain' is a train + flight transfer, 'Train' is
    # another transfer type; extend here to support them.
    direct_legs = []
    for route in route_list or []:
        if route['routeType'] == 'Flight':
            direct_legs.extend(route['legs'])

    if not direct_legs:
        # Covers both "no routeList at all" and "only transfer routes", so
        # an empty table is never printed.
        print(dcity + "----->" + acity + "无直达")
        return

    mean()
    for leg in direct_legs:
        getInfo(leg)
def main():
    """Interactive entry point: build the city lookup, prompt, then search.

    Downloads the city list, converts it to a name -> code dict, adds the
    cities the author found missing from that dict, reads the departure
    city, arrival city and date from standard input, and calls ``search``.
    """
    response = request()
    content = json.loads(response)
    # Map city names to their English abbreviations
    city_and_index = get_city_and_index(content)
    # Cities added by hand because they are absent from city_and_index
    city_and_index['荆州'] = 'SHS'
    city_and_index['惠州'] = 'HUZ'
    city_and_index['佛山'] = 'FUO'
    city_and_index['甘孜'] = 'GZG'
    # Strip stray whitespace so "北京 " still matches the dict key "北京"
    start = input("请输入开始地址(如:北京):").strip()
    end = input("请输入终点地址(如:上海):").strip()
    # Named travel_date rather than time to avoid shadowing the stdlib module
    travel_date = input("请输入时间(格式:2020-03-02):").strip()
    search(city_and_index, start, end, travel_date)


if __name__ == '__main__':
    main()
运行结果
运行结果
3.感谢
首先谢谢你能够把这篇文章看完,如果有哪里不好或者不对的地方请指教,这是我第一次写博客,请各位大佬多多关照,谢谢!!!
作者:小小乖。