I have this DataFrame, and I'd like to train, validate and test a model on its features using cross-validation:
RangeIndex: 370 entries, 0 to 369
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   round               370 non-null    int64    <---- will be dropped
 1   home_team_goal      370 non-null    int64    <---- will be dropped
 2   away_team_goal      370 non-null    int64    <---- will be dropped
 3   home_best_attack    370 non-null    float64
 4   home_best_defense   370 non-null    float64
 5   home_avg_attack     370 non-null    float64
 6   home_avg_defense    370 non-null    float64
 7   home_std_attack     370 non-null    float64
 8   home_std_defense    370 non-null    float64
 9   gk_home_player_1    370 non-null    float64
 10  away_avg_attack     370 non-null    float64
 11  away_avg_defense    370 non-null    float64
 12  away_std_attack     370 non-null    float64
 13  away_std_defense    370 non-null    float64
 14  away_best_attack    370 non-null    float64
 15  away_best_defense   370 non-null    float64
 16  gk_away_player_1    370 non-null    float64
dtypes: float64(14), int64(3)

Dataset
My dataset, however, must include some added topological features (which capture diagrams and geometrical relationships between the original set of features above).
This is what I need my dataset to end up looking like, as a pandas DataFrame:
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   home_best_attack    2565 non-null   float64
 1   home_best_defense   2565 non-null   float64
 2   home_avg_attack     2565 non-null   float64
 3   home_avg_defense    2565 non-null   float64
 4   home_std_attack     2565 non-null   float64
 5   home_std_defense    2565 non-null   float64
 6   gk_home_player_1    2565 non-null   float64
 7   away_avg_attack     2565 non-null   float64
 8   away_avg_defense    2565 non-null   float64
 9   away_std_attack     2565 non-null   float64
 10  away_std_defense    2565 non-null   float64
 11  away_best_attack    2565 non-null   float64
 12  away_best_defense   2565 non-null   float64
 13  gk_away_player_1    2565 non-null   float64
 14  bottleneck_metric   2565 non-null   float64  <---- will be added
 15  wasserstein_metric  2565 non-null   float64  <---- will be added
 16  landscape_metric    2565 non-null   float64  <---- will be added
 17  betti_metric        2565 non-null   float64  <---- will be added
 18  heat_metric         2565 non-null   float64  <---- will be added
 19  label               2565 non-null   float64  <---- will be added

Test Feature Extraction
I have the following methods for extracting those added features from the original dataframe:
def extract_topological_features(diagrams):
    """Build amplitude features from persistence diagrams, one block per metric.

    For every metric name in the fixed list below an ``Amplitude``
    transformer is fitted on ``diagrams``; the per-metric outputs are then
    joined side by side.

    Args:
        diagrams: Input accepted by ``Amplitude.fit_transform``. In this
            file it is the first element returned by the topology
            pipeline's ``fit_transform_resample``.

    Returns:
        The per-metric outputs concatenated along axis 1, in the order
        bottleneck, wasserstein, landscape, betti, heat.
        NOTE(review): the number of columns each metric contributes depends
        on the installed ``Amplitude`` implementation -- confirm it is one
        column per metric if exactly five new columns are expected.
    """
    metrics = ['bottleneck', 'wasserstein', 'landscape', 'betti', 'heat']
    new_features = [
        Amplitude(metric=metric).fit_transform(diagrams) for metric in metrics
    ]
    return np.concatenate(new_features, axis=1)


def extract_features_for_fantasy_prediction(x_train, y_train, x_test, y_test,
                                            pipeline):
    """Append topological features to ``x_test``, processing it in batches.

    The test rows are consumed ``shift`` (10) at a time. Each batch is
    stacked under the training rows seen so far, the pipeline is refitted on
    that stack, and the diagrams of the trailing batch rows are turned into
    features. The batch then joins the training rows used for the next batch.

    Args:
        x_train: 2-D array of training features. Only its first
            ``x_test.shape[1]`` columns are used, so a matrix that already
            carries appended topological columns can be passed.
        y_train: 1-D array of training labels.
        x_test: 2-D array of test features (without topological columns).
        y_test: Array of test labels; each batch slice is flattened to 1-D.
        pipeline: Object exposing ``fit_transform_resample(X, y)`` whose
            first return value holds the diagrams.

    Returns:
        ``x_test`` with the topological feature columns appended on axis 1.

    Raises:
        ValueError: If ``x_test`` has no rows, since there is nothing to
            extract features for.
    """
    if len(x_test) == 0:
        raise ValueError(
            "x_test is empty: no rows to extract topological features for"
        )

    shift = 10
    # Keep only the columns the test matrix has, so both can be stacked
    # (previously hard-coded to 14).
    n_base_features = x_test.shape[1]
    all_x_train = x_train[:, :n_base_features]
    all_y_train = y_train

    top_features = []
    for start in tqdm(range(0, len(x_test), shift)):
        # Slicing clamps at the end of the array, so the last batch is
        # simply shorter; its real length is taken from the slice itself.
        x_batch = x_test[start:start + shift]
        y_batch = y_test[start:start + shift].reshape((-1,))
        batch_len = len(x_batch)

        batch = np.concatenate([all_x_train, x_batch])
        batch_y = np.concatenate([all_y_train, y_batch])

        diagrams_batch, _ = pipeline.fit_transform_resample(batch, batch_y)
        # NOTE(review): assumes the trailing ``batch_len`` diagrams belong to
        # the rows of this batch -- confirm the pipeline keeps them last.
        top_features.append(
            extract_topological_features(diagrams_batch[-batch_len:])
        )

        # The stacked arrays already are "training rows + this batch".
        all_x_train = batch
        all_y_train = batch_y

    return np.concatenate(
        [x_test, np.concatenate(top_features, axis=0)], axis=1
    )

# Cross Validation
and this is my code for doing so:
def cross_validate(self, full_x, full_y, splitting_dates):
    """Tune a random forest with and without topological features and score both.

    Rows are split on the ``round`` column into train / validation / test,
    then three scores are produced on the test rows: the tuned model without
    topological features, the same model refitted with them, and a model
    re-tuned on the topological feature set.

    Args:
        full_x: DataFrame of features; must contain a ``round`` column,
            which is used for splitting and then dropped.
        full_y: Labels, indexable by a boolean mask built from ``full_x``
            and exposing ``.values``.
        splitting_dates: Sequence whose first three items are
            ``(train_split_date, val_split_date, end_date)``. Validation is
            ``[train_split_date, val_split_date)``, test is
            ``[val_split_date, end_date)``, training is everything else.

    Returns:
        Tuple ``(best_model_params, best_topo, best_model_config_with_topo,
        score, score_top, score_best_topo_and_model)``.
    """
    train_split_date = splitting_dates[0]
    val_split_date = splitting_dates[1]
    end_date = splitting_dates[2]

    rounds = full_x['round']
    # Training rows are exactly those outside the validation and test
    # windows. The previous mask, (round > train_split_date) |
    # (round <= end_date), matched every row and leaked validation/test
    # rows into training.
    train_mask = (rounds < train_split_date) | (rounds >= end_date)
    val_mask = (rounds >= train_split_date) & (rounds < val_split_date)
    test_mask = (rounds >= val_split_date) & (rounds < end_date)

    # drop() returns a new frame, so the caller's DataFrame is untouched.
    train_x = full_x[train_mask].drop(columns="round").values
    train_y = full_y[train_mask].values
    val_x = full_x[val_mask].drop(columns="round").values
    val_y = full_y[val_mask].values
    test_x = full_x[test_mask].drop(columns="round").values
    test_y = full_y[test_mask].values

    print("START VALIDATING MODEL")
    models_cv = self._validate_k_fold_model(train_x, train_y, val_x, val_y)
    best_model_params = best_combination(models_cv)
    # "score" is bookkeeping, not a RandomForestClassifier argument.
    best_model_params.pop("score")
    best_model = RandomForestClassifier(**best_model_params)

    best_model.fit(train_x, train_y)

    score = best_model.score(test_x, test_y)
    print(f'score no_top {score}')
    print(f'best model parameters no_top {best_model_params}')

    print("START VALIDATING PARAMS")
    topo_cv = self._validate_k_fold_top(best_model, train_x, train_y,
                                        val_x, val_y)
    best_topo = best_combination(topo_cv)
    best_topo.pop("score")
    best_topo_pipeline_list = [
        ('extract_subspaces', SubSpaceExtraction(**best_topo)),
        ('compute_diagrams', VietorisRipsPersistence(n_jobs=-1)),
    ]
    best_topo_pipeline = Pipeline(best_topo_pipeline_list)

    # From here on, train and validation rows together form the fit set.
    train_x_for_test = np.concatenate([train_x, val_x], axis=0)
    train_y_for_test = np.concatenate([train_y, val_y], axis=0)

    diagrams_train, _ = best_topo_pipeline.fit_transform_resample(
        train_x_for_test, train_y_for_test)

    print("EXTRACTING TOPOLOGICAL FEATURES TRAIN")
    top_features_train = extract_topological_features(diagrams_train)

    x_train_model = np.concatenate([train_x_for_test, top_features_train],
                                   axis=1)
    # Same hyperparameters as before, refitted on the widened feature set.
    best_model.fit(x_train_model, train_y_for_test)

    print("EXTRACTING TOPOLOGICAL FEATURES TEST")
    x_test_model = extract_features_for_fantasy_prediction(
        x_train_model, train_y_for_test, test_x, test_y, best_topo_pipeline)

    score_top = best_model.score(x_test_model, test_y)

    val_x_with_topo = extract_features_for_fantasy_prediction(
        train_x, train_y, val_x, val_y, best_topo_pipeline)

    print('START VALIDATING MODEL WITH OPTIMAL TOPOLOGY')
    # The first train_x.shape[0] rows of x_train_model are the train rows.
    model_config_with_topo = self._validate_k_fold_model(
        x_train_model[:train_x.shape[0]], train_y, val_x_with_topo, val_y)
    best_model_config_with_topo = best_combination(model_config_with_topo)
    best_model_config_with_topo.pop('score')

    best_model_with_topo = RandomForestClassifier(
        **best_model_config_with_topo)
    best_model_with_topo.fit(x_train_model, train_y_for_test)

    score_best_topo_and_model = best_model_with_topo.score(x_test_model,
                                                           test_y)
    print(f'score best model and topo_feat {score_best_topo_and_model}')

    return (best_model_params, best_topo, best_model_config_with_topo,
            score, score_top, score_best_topo_and_model)

# QUESTION:
Running the following code...
y = compute_match_result(df) df.pop('home_team_goal') df.pop('away_team_goal') cv = CrossValidation(k_mins=k_mins, k_maxs=k_maxs, dist_percentages=distances, **model_params) cv_output = cv.cross_validate(df, y, (train_split_date, val_split_date, end_date)) ...and since most of the features are computed as arrays, at precisely which point of cross_validate() above (and HOW) can I add the topological features to the original df, ending up with the desired dataset (dataframe) above?
Note: full code can be found here.
https://stackoverflow.com/questions/66432188/extract-topological-features-from-dataframe-an-add-them-to-a-new-expanded-dataf March 02, 2021 at 09:57AM
没有评论:
发表评论