logo资料库

对于上亿数据使用python进行分块处理.docx

第1页 / 共1页
资料共1页,全文预览结束
"""Randomly down-sample a very large CSV: keep ~1 row per 1000, streamed in chunks.

Reads '训练数据集2时间处理后.csv' with pandas in 10M-row chunks (bounding peak
memory), picks one random row from every consecutive 1000-row bucket, and
writes the sampled rows to '训练数据集1随机挑选.csv'.
"""


def _sample_rows(rows, bucket_size=1000):
    """Return one randomly chosen row from each consecutive *bucket_size* slice of *rows*.

    A partial trailing bucket (fewer than *bucket_size* rows) contributes one
    sample too, so the tail of the input is not silently dropped.
    """
    sampled = []
    n = len(rows)
    full_buckets = n // bucket_size
    for i in range(full_buckets):
        # One uniform pick per full bucket of bucket_size rows.
        sampled.append(rows[random.randrange(i * bucket_size, (i + 1) * bucket_size)])
    # BUG FIX: the original sampled the tail with
    # randrange(int(line/v)*v - 1, line), which starts one element inside the
    # last full bucket (double-sampling it) and, when len(rows) is an exact
    # multiple of v, always re-emitted the final row. Sample the tail only
    # when a remainder actually exists, starting right after the full buckets.
    if n % bucket_size:
        sampled.append(rows[random.randrange(full_buckets * bucket_size, n)])
    return sampled


def main():
    """Stream the input CSV chunk by chunk, sample rows, write the output CSV."""
    sampled_rows = []
    # 'with' guarantees the input handle is closed even if pandas raises;
    # the original open() was never paired with a close().
    with open('训练数据集2时间处理后.csv') as f:
        reader = pd.read_csv(f, sep=',', iterator=True)
        while True:
            try:
                # 10,000,000 rows per chunk: bounds memory for files too big
                # to load at once.
                chunk = reader.get_chunk(10000000)
            except StopIteration:
                print("Iteration is stopped.")
                break
            # Process each chunk immediately instead of accumulating every
            # chunk in a list first (the original kept the whole file in RAM,
            # defeating the purpose of chunked reading). .tolist() replaces
            # the hand-rolled double loop that copied values element by element.
            sampled_rows.extend(_sample_rows(chunk.values.tolist()))
    # newline='' per the csv module docs, so csv.writer controls line endings.
    with open("训练数据集1随机挑选.csv", "w", newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["ip", "app", "device", "os", "channel", "click_time", "is_attributed"])
        writer.writerows(sampled_rows)
        # NOTE: the original called csvfile.close() here — redundant inside
        # 'with', which already closes the file on exit.


if __name__ == "__main__":
    main()
分享到:
收藏