Source code for tasks.numerai_train_and_predict

# -*- coding: utf-8 -*-
import os
from datetime import datetime

from numerapi.numerapi import NumerAPI
import luigi
import pandas as pd
from sklearn import metrics, preprocessing, linear_model

from .numerai_fetch_training_data import FetchAndExtractData


class TrainAndPredict(luigi.Task):
    """
    Trains a logistic regression classifier on the training data, then
    predicts the targets on the tournament data.

    The default signature of this task is
    ``TrainAndPredict(output_path='./data/')``.

    :param str output_path:
        path to the directory where the predictions shall be saved to,
        defaults to ``./data/``.
    """
    output_path = luigi.Parameter(default='./data/')

    def requires(self):
        """
        Dependencies to be fulfilled prior to execution. This task needs the
        :py:class:`tasks.numerai_fetch_training_data.FetchAndExtractData`
        task that provides the training/tournament data.
        """
        return FetchAndExtractData(output_path=self.output_path)

    def output(self):
        """
        Saves outputs of this task--which is a csv file of the predictions
        made for the given data.

        The file name embeds the current round number as reported by
        ``NumerAPI.get_current_round()``, so every call queries the API.
        """
        # Kept as an instance attribute for backward compatibility with any
        # code that reads ``task.apc`` after calling ``output()``.
        self.apc = NumerAPI()
        fn = 'predictions_{0}_LogisticRegression.csv'.format(
            self.apc.get_current_round())
        return luigi.LocalTarget(os.path.join(self.output_path, fn))

    def run(self):
        """
        Trains a model and makes predictions given the data. These are then
        saved to the csv file described by :py:meth:`output`.

        The input is indexed by the keys ``'training_data.csv'`` and
        ``'tournament_data.csv'``; each entry must expose a ``path``.
        """
        data = self.input()
        out = self.output()

        training_data = pd.read_csv(data['training_data.csv'].path, header=0)
        prediction_data = pd.read_csv(data['tournament_data.csv'].path,
                                      header=0)

        # Every column whose name contains "feature" is used as a predictor.
        features = [f for f in training_data.columns if "feature" in f]
        X = training_data[features]
        Y = training_data["target"]
        x_prediction = prediction_data[features]
        ids = prediction_data["id"]

        # This is the model that will learn to predict.
        model = linear_model.LogisticRegression(n_jobs=-1)
        model.fit(X, Y)

        # The model returns two columns: [probability of 0, probability of 1].
        # We are just interested in the probability that the target is 1.
        y_prediction = model.predict_proba(x_prediction)
        results = y_prediction[:, 1]
        results_df = pd.DataFrame(data={'probability': results})
        joined = pd.DataFrame(ids).join(results_df)

        # Write once, and only to the Luigi target: the earlier duplicate
        # pass that dumped a stray "predictions.csv" into the current working
        # directory has been removed.
        print("Writing predictions to {0}".format(out.path))
        joined.to_csv(out.path, index=False)