将数据保存为CSV形式存储
将数据保存为CSV形式存储
可以使用 pandas.DataFrame.to_csv() 函数将数据保存为 csv 文件。
下面展示如何通过自定义函数将数据保存为 csv 文件，函数的注释已经写在代码中。代码示例：
import os
import sys

import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
from tensorflow import keras
from sklearn.datasets import fetch_california_housing

# Load the California housing-price dataset through scikit-learn
# (fetched and cached locally on first use).
housing = fetch_california_housing()

# Show the dataset description and the shapes of the feature matrix
# and the target vector.
print(housing.DESCR)
print(housing.data.shape)
print(housing.target.shape)
# train_test_split splits a dataset into two parts. `test_size` controls the
# ratio; when omitted, the default is a 3:1 split (25% held out).
from sklearn.model_selection import train_test_split

# First split: full dataset -> (train + validation) and test.
x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state=1
)

# Second split: (train + validation) -> train and validation.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state=2
)

print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)
(11610, 8) (11610,) (3870, 8) (3870,) (5160, 8) (5160,)
from sklearn.preprocessing import StandardScaler

# Standardize the features. The housing feature matrices are already 2-D
# (samples x features), so no reshaping is needed before scaling.
scaler = StandardScaler()

# Fit the scaling statistics on the training set only, then reuse the same
# fitted scaler for the validation and test sets so that no information from
# those sets leaks into the preprocessing step.
# (For more on fit_transform vs. transform, see the author's TensorFlow posts.)
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)
# np.c_ concatenates arrays column-wise: append the target values as the
# last column of each scaled feature matrix.
train_data = np.c_[x_train_scaled, y_train]
valid_data = np.c_[x_valid_scaled, y_valid]
test_data = np.c_[x_test_scaled, y_test]

# Build the header string: the dataset's feature names plus a name for the
# appended target column.
# NOTE(review): "MidanHouseValue" looks like a typo for "MedianHouseValue";
# left unchanged because the string is passed on to save_to_csv and is
# presumably written into the generated files -- confirm before renaming.
header_cols = housing.feature_names + ["MidanHouseValue"]
header_str = ",".join(header_cols)

# Generate the CSV files for each split.
# NOTE(review): save_to_csv and output_dir are not defined in this excerpt;
# they must be defined before this point. Presumably n_parts is the number
# of files each split is divided into and the return value is the list of
# written file names -- verify against the save_to_csv definition.
train_filenames = save_to_csv(output_dir, train_data, "train", header_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, "valid", header_str, n_parts=10)
test_filenames = save_to_csv(output_dir, test_data, "test", header_str, n_parts=10)