logo资料库

携程航班信息爬取(python)—第一次写博客,不好请别见外!.pdf

第1页 / 共5页
第2页 / 共5页
第3页 / 共5页
第4页 / 共5页
第5页 / 共5页
资料共5页,全文预览结束
携程航班信息爬取(python)—第一次写博客,不好请别见外! 1.航班信息接口 api=“https://flights.ctrip.com/itinerary/api/12808/products”,这个接口中包含了所要查询的航班信息。 *这是接口中所展现的内容,所查询到的航班信息位于返回 JSON 的 data.routeList 中: 1.获取城市的英文缩写 因为在 POST 提交的时候,需要在请求体(即后文代码中的 data_frame 字典)的 dcity、acity 字段中填入城市的英文字母缩写,如:
所以我们必须得到这些城市的字母缩写,根据接口api=“https://flights.ctrip.com/itinerary/api/poi/get”,其中有城市的缩写信息,如下: 所以请求这个接口就可以得到我们想要的信息了,通过正则表达式,把需要的信息提取出来,做成一个字典,我们就完成了,需要的时候,就用dict.get()方法就可以取出我们想要的 城市的缩写,但是在这之中某几个城市没有,所以只能手动添加。 首先请求api:https://flights.ctrip.com/itinerary/api/poi/get def request(): # 固定提供的 api 接口 url = "https://flights.ctrip.com/itinerary/api/poi/get" # 根据自己的请求头修改 headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0", "Referer": "https://flights.ctrip.com/itinerary", "Content-Type": "application/json" } # get请求 response = requests.get(url, headers=headers).text return response 获得城市字典的代码如下: def get_city_and_index(content): data1 = content['data']['ABCDEF'] data2 = content['data']['GHIJ'] data3 = content['data']['KLMN'] data4 = content['data']['PQRSTUVW'] data5 = content['data']['XYZ'] city_and_index = {} for x in [data1, data2, data3, data4, data5]: for k, y in x.items(): for z in y: data = re.search('\((.*)\)', z['data']) data = re.sub('\(||\)', '', data.group(0)) city_and_index[z['display']] = data return city_and_index
逻辑为下: response=request() # print(response) content = json.loads(response) # 获取城市的英文缩写 city_and_index = get_city_and_index(content) 2.POST提交提交 得到需要的城市字典后,我们就可以进行POST提交了, 参数city是所得到的字典,dcity是开始城市,acity是到达的城市,date为出行日期。这个函数中,还包括mean()函数和getInfo()函数,mean()函数是美化输出,getInfo()函数是从json 类型的legs中得到具体的航班信息: def mean(): print("-"*100) print("航班号\t"+"航空公司\t"+"飞机型号\t"+"类型\t"+"起飞地点\t"+"降落地点\t"+"起飞时间\t"+"到达时间\t" +"精准率\t") print("-"*100) def getInfo(leg): flight = leg['flight'] # print(flight) flightNumber = flight['flightNumber'] # 航班号 airlineName = flight['airlineName'] # 航空公司 craftTypeName = flight['craftTypeName'] # 飞机类型 craftTypeKindDisplayName = flight['craftTypeKindDisplayName'] # 飞机类型 start = flight['departureAirportInfo']['airportName'] # 飞机乘坐地点 end = flight['arrivalAirportInfo']['airportName'] # 降落机场 departureDate = flight['departureDate'] arrivalDate = flight['arrivalDate'] punctualityRate = flight['punctualityRate'] # 精准率 print(flightNumber + "\t" + airlineName + "\t" + craftTypeName + "\t" + craftTypeKindDisplayName + "\t" + start + "\t" + end + "\t" + departureDate + "\t" + arrivalDate + "\t" + punctualityRate) 完整的search()函数: def search(city, dcity, acity, date): # url='https://flights.ctrip.com/itinerary/api/12808/products' url = 'https://flights.ctrip.com/itinerary/api/12808/products' headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36", "Content-Type": "application/json", # 声明文本类型为 json 格式 "referer": "https://flights.ctrip.com/itinerary/roundtrip/bjs-sha?date=2020-03-02,2020-03-03", "cookie": '_abtest_userid=9a22401e-030f-4503-b20d-6696205a68ed; _RSG=cP4H2lOeBh4Q5LSFnqjOWB; _RDG=28258b145e36812c43286457fb56969f7d; _RGUID=4acba18b- c244-4666-b818-b74c4f71515a; _ga=GA1.2.67711177.1574923739; MKT_CKID=1582371932248.xh3tu.f3q7; MKT_Pagesource=PC; 
DomesticUserHostCity=BZX|%b0%cd%d6%d0; _gid=GA1.2.861767137.1582695241; _RF1=171.208.25.215; FlightIntl=Search= [%22BZX|%E5%B7%B4%E4%B8%AD(BZX)|3966|BZX|480%22%2C%22TYO|%E4%B8%9C%E4%BA%AC(TYO)|228|TYO|540%22%2C%222020-02-27%22%2C%222020-03- 01%22]; Session=smartlinkcode=U130026&smartlinklanguage=zh&SmartLinkKeyWord=&SmartLinkQuary=&SmartLinkHost=; Union=OUID=index&AllianceID=4897&SID=155952&SourceID=&createtime=1582800701&Expires=1583405501197; gad_city=db071d1cf7dabf738c20a3f37592a919; MKT_CKID_LMT=1582800704719; appFloatCnt=26; FD_SearchHistorty={"type":"D","data":"D%24%u5317%u4EAC%28BJS%29%24BJS%242020-03- 02%24%u4E0A%u6D77%28SHA%29%24SHA%242020-03-03"}; _bfa=1.1574923736071.3zbuu9.1.1582722830548.1582800698761.14.167; _bfs=1.3; _jzqco=%7C%7C%7C%7C1582800704982%7C1.949239734.1574923738750.1582800704716.1582800712660.1582800704716.1582800712660.undefined.0.0.105.105; __zpspc=9.17.1582800704.1582800712.2%232%7Cwww.baidu.com%7C%7C%7C%7C%23; _bfi=p1%3D10320673304%26p2%3D101023%26v1%3D167%26v2%3D166' } data_frame = { "flightWay": "Oneway", "classType": "ALL", "hasChild": False, "hasBaby": False, "searchIndex": 1, "airportParams": [ {"dcity": city.get(dcity), "acity": city.get(acity), "dcityname": dcity, "acityname": acity, "date": date }], 'token': "db3430b9644d192713fe428890dff5b2", } response = requests.post(url, data=json.dumps(data_frame), headers=headers).text # print(response) routeList = json.loads(response).get('data').get('routeList') # print("routeList:" + routeList) if routeList != None: mean() for route in routeList: routeType=route['routeType'] if routeType=='Flight': legs=route['legs'] for leg in legs: # print(leg) getInfo(leg) else: print(dcity + "----->" + acity + "无直达") 完整代码如下: 完整代码如下: import requests import re import json def request(): # 固定提供的 api 接口 url = "https://flights.ctrip.com/itinerary/api/poi/get" #请求头,可自行更改 headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0", "Referer": 
"https://flights.ctrip.com/itinerary", "Content-Type": "application/json" } #发送get请求 response = requests.get(url, headers=headers).text return response def get_city_and_index(content): data1 = content['data']['ABCDEF'] data2 = content['data']['GHIJ'] data3 = content['data']['KLMN'] data4 = content['data']['PQRSTUVW'] data5 = content['data']['XYZ'] city_and_index = {} for x in [data1, data2, data3, data4, data5]: for k, y in x.items(): for z in y: data = re.search('\((.*)\)', z['data']) data = re.sub('\(||\)', '', data.group(0)) city_and_index[z['display']] = data return city_and_index def getInfo(leg): flight = leg['flight'] # print(flight) flightNumber = flight['flightNumber'] # 航班号
airlineName = flight['airlineName'] # 航空公司 craftTypeName = flight['craftTypeName'] # 飞机类型 craftTypeKindDisplayName = flight['craftTypeKindDisplayName'] # 飞机类型 start = flight['departureAirportInfo']['airportName'] # 飞机乘坐地点 end = flight['arrivalAirportInfo']['airportName'] # 降落机场 departureDate = flight['departureDate'] arrivalDate = flight['arrivalDate'] punctualityRate = flight['punctualityRate'] # 精准率 print(flightNumber + "\t" + airlineName + "\t" + craftTypeName + "\t" + craftTypeKindDisplayName + "\t" + start + "\t" + end + "\t" + departureDate + "\t" + arrivalDate + "\t" + punctualityRate) def mean(): print("-"*100) print("航班号\t"+"航空公司\t"+"飞机型号\t"+"类型\t"+"起飞地点\t"+"降落地点\t"+"起飞时间\t"+"到达时间\t" +"精准率\t") print("-"*100) def search(city, dcity, acity, date): # url='https://flights.ctrip.com/itinerary/api/12808/products' url = 'https://flights.ctrip.com/itinerary/api/12808/products' headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36", "Content-Type": "application/json", # 声明文本类型为 json 格式 "referer": "https://flights.ctrip.com/itinerary/roundtrip/bjs-sha?date=2020-03-02,2020-03-03", "cookie": '_abtest_userid=9a22401e-030f-4503-b20d-6696205a68ed; _RSG=cP4H2lOeBh4Q5LSFnqjOWB; _RDG=28258b145e36812c43286457fb56969f7d; _RGUID=4acba18b- c244-4666-b818-b74c4f71515a; _ga=GA1.2.67711177.1574923739; MKT_CKID=1582371932248.xh3tu.f3q7; MKT_Pagesource=PC; DomesticUserHostCity=BZX|%b0%cd%d6%d0; _gid=GA1.2.861767137.1582695241; _RF1=171.208.25.215; FlightIntl=Search= [%22BZX|%E5%B7%B4%E4%B8%AD(BZX)|3966|BZX|480%22%2C%22TYO|%E4%B8%9C%E4%BA%AC(TYO)|228|TYO|540%22%2C%222020-02-27%22%2C%222020-03- 01%22]; Session=smartlinkcode=U130026&smartlinklanguage=zh&SmartLinkKeyWord=&SmartLinkQuary=&SmartLinkHost=; 
Union=OUID=index&AllianceID=4897&SID=155952&SourceID=&createtime=1582800701&Expires=1583405501197; gad_city=db071d1cf7dabf738c20a3f37592a919; MKT_CKID_LMT=1582800704719; appFloatCnt=26; FD_SearchHistorty={"type":"D","data":"D%24%u5317%u4EAC%28BJS%29%24BJS%242020-03- 02%24%u4E0A%u6D77%28SHA%29%24SHA%242020-03-03"}; _bfa=1.1574923736071.3zbuu9.1.1582722830548.1582800698761.14.167; _bfs=1.3; _jzqco=%7C%7C%7C%7C1582800704982%7C1.949239734.1574923738750.1582800704716.1582800712660.1582800704716.1582800712660.undefined.0.0.105.105; __zpspc=9.17.1582800704.1582800712.2%232%7Cwww.baidu.com%7C%7C%7C%7C%23; _bfi=p1%3D10320673304%26p2%3D101023%26v1%3D167%26v2%3D166' } data_frame = { "flightWay": "Oneway", "classType": "ALL", "hasChild": False, "hasBaby": False, "searchIndex": 1, "airportParams": [ {"dcity": city.get(dcity), "acity": city.get(acity), "dcityname": dcity, "acityname": acity, "date": date }], 'token': "db3430b9644d192713fe428890dff5b2", } response = requests.post(url, data=json.dumps(data_frame), headers=headers).text # print(response) routeList = json.loads(response).get('data').get('routeList') # print("routeList:" + routeList) if routeList != None: mean() for route in routeList: routeType=route['routeType'] #只选择了有直达的 #Flight为直达 Flighttrain 为火车和飞机中转 Train为只是飞机中转 #在此可以扩展 if routeType=='Flight': legs=route['legs'] for leg in legs: # print(leg) getInfo(leg) else: print(dcity + "----->" + acity + "无直达") def main(): response=request() # print(response) content = json.loads(response) # 获取城市的英文缩写 city_and_index = get_city_and_index(content) # print(city_and_index) #以下为添加的城市信息,在city_and_index字典中不存在 city_and_index['荆州'] = 'SHS' city_and_index['惠州'] = 'HUZ' city_and_index['佛山'] = 'FUO' city_and_index['甘孜'] = 'GZG' start=input("请输入开始地址(如:北京):") end=input("请输入终点地址(如:上海):") time=input("请输入时间(格式:2020-03-02):") # start="上海" # end='西安' # time='2020-03-02' search(city_and_index,start,end,time) if __name__ == '__main__': main() 运行结果 运行结果
3.感谢 首先谢谢你能够把这篇文章看完,如果有哪里写得不好或者不对的地方,还请指教。这是我第一次写博客,请各位大佬多多关照,谢谢!!! 作者:小小乖。
分享到:
收藏