Source code for Astraea.Astraea

import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import os 

from astropy import units as u
from astropy.coordinates import SkyCoord
import astropy.utils as au
import astropy.coordinates as coord

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
#from sklearn.externals import joblib
import joblib
    
"""--------------------------------------------- start of function to download/load RF ---------------------------------------------"""
def download_RF_class(url='https://zenodo.org/record/3620729/files/RF_Class_model.sav?download=1'):
    os.system('wget '+url)
    os.system('mv RF_Class_model.sav?download=1 ./data/RF_Class_model.sav')
   
def download_RF_regr_1est(url='https://zenodo.org/record/3620729/files/RF_Regre_model_1est_flicker.sav?download=1'):
    os.system('wget '+url)
    os.system('mv RF_Regre_model_1est_flicker.sav?download=1 ./data/RF_Regre_model_1est_flicker.sav')

def download_RF_regr_100est(url='https://zenodo.org/record/3620729/files/RF_Regre_model_100est_flicker.sav?download=1'):
    os.system('wget '+url)
    os.system('mv RF_Regre_model_100est_flicker.sav?download=1 ./data/RF_Regre_model_100est_flicker.sav')


# load testing data
def load_KeplerTest():
    """ Load the Kepler testing data from zenodo.org, downloading it first if it is not already in ./data/. """
    if not os.path.exists('./data/'):
        os.system('mkdir data')
    if not os.path.exists('./data/TestingData.pkl'):
        os.system('wget https://zenodo.org/record/3628778/files/TestingData.pkl?download=1')
        os.system('mv TestingData.pkl?download=1 ./data/TestingData.pkl')
    return pd.read_pickle('./data/TestingData.pkl')
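# Example usage (a minimal sketch; `test_df` is an illustrative variable name):
#
#     test_df = load_KeplerTest()   # pandas DataFrame with the Kepler testing data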
    

def load_RF():
    """ Load the random forest classifier and regressors from zenodo.org.

    Two regressors will be loaded, one with 1 estimator and one with 100 estimators. The 1-estimator model minimizes bias (systematic offset) at the cost of high variance (scatter), while the 100-estimator model minimizes variance at the cost of higher bias. To predict rotation periods from Kepler light curves it is best to use the model with 100 estimators; use the model with 1 estimator otherwise. Models trained on TESS light curves are still being developed.

    Returns the classifier, the 100-estimator regressor and the 1-estimator regressor, in that order.
    """
    if not os.path.exists('./data/'):
        os.system('mkdir data')
    if not os.path.exists('./data/RF_Class_model.sav'):
        print('downloading classifier!')
        download_RF_class()
    if not os.path.exists('./data/RF_Regre_model_1est_flicker.sav'):
        print('downloading regressor with 1 estimator!')
        download_RF_regr_1est()
    if not os.path.exists('./data/RF_Regre_model_100est_flicker.sav'):
        print('downloading regressor with 100 estimators!')
        download_RF_regr_100est()
    return joblib.load('./data/RF_Class_model.sav'),joblib.load('./data/RF_Regre_model_100est_flicker.sav'),joblib.load('./data/RF_Regre_model_1est_flicker.sav')
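# Example usage (a minimal sketch; variable names are illustrative). load_RF() returns
# the classifier, the 100-estimator regressor and the 1-estimator regressor, in that order:
#
#     RF_class, RF_regr_100, RF_regr_1 = load_RF()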
"""--------------------------------------------- end of function to download/load RF ---------------------------------------------""" """--------------------------------------------- start of function to use trained RF to get Prot ---------------------------------------------"""
def FLICKERinstall():
    """ Install the FLICKER software used to calculate Flicker values from light curves.

    Documentation: https://flicker.readthedocs.io.
    """
    os.system('pip install git+https://github.com/lyx12311/FLICKER.git')
    #os.system('git clone https://github.com/lyx12311/FLICKER.git')
    #os.system('cd FLICKER')
    #os.system('python setup.py install')
    #os.system('cd ..')
def getTrainF():
    """Print out the features needed by the models, in order.
    """
    Features_class=['LG_peaks', 'Rvar', 'parallax', 'radius_percentile_lower', 'radius_percentile_upper', 'phot_g_mean_flux_over_error', 'bp_g']
    Features_regr=['teff','bp_g','lum_val','v_tan','phot_g_mean_flux_over_error','v_b','radius_val','b','Rvar','flicker']
    print(">>> classification features are: ['LG_peaks [Lomb-Scargle peak height]', 'Rvar [ppm]', 'parallax [gaia]', 'radius_percentile_lower [gaia]', 'radius_percentile_upper [gaia]', 'phot_g_mean_flux_over_error [gaia]', 'bp_g [gaia]']\n\n")
    print(">>> regression features are: ['teff [gaia]','bp_g [gaia]','lum_val [gaia]','v_tan [getVs()]','phot_g_mean_flux_over_error [gaia]','v_b [getVs()]','radius_val [gaia]','b [gaia]','Rvar [ppm]','flicker [FLICKER]']\n\n")
    return Features_class,Features_regr
def getKeplerProt(X_pred):
    """Predict rotation periods from the trained models.

    This function predicts rotation periods for stars in the Kepler field. The models are trained on rotation periods from `McQuillan et al. (2014) <https://arxiv.org/abs/1402.5694>`_, `Santos et al. (2019) <https://arxiv.org/abs/1908.05222>`_ and `Garcia et al. (2014) <https://arxiv.org/abs/1403.7155>`_. If the models are not already downloaded, this tool will download them, which might take a couple of minutes. It first passes the stars through a classifier, which identifies which stars have measurable rotation periods. It then uses two regressor models (one with 1 estimator and one with 100 estimators) to predict rotation periods. If a column "Prot" exists, it will also output the true periods associated with the predicted periods. The light curve feature "flicker" can be calculated with the software `FLICKER <https://flicker.readthedocs.io/en/latest/>`_.

    Args:
        X_pred ([Pandas DataFrame]): DataFrame containing all variables needed; run Astraea.getTrainF() to print out the requirements

    Returns:
        <pandas.DataFrame> or <pandas.Series>:

        :Containing:

            :TrueProt: True rotation period (if available)
            :Prot_prediction_1est: Period predictions with 1 estimator
            :Prot_prediction_100est: Period predictions with 100 estimators
    """
    # see what features are needed
    Features_class,Features_regr=getTrainF()

    # check the classification features; if any are missing, exit the function
    try:
        X_pred[Features_class]
    except KeyError:
        print("Make sure ['LG_peaks', 'Rvar', 'parallax', 'radius_percentile_lower', 'radius_percentile_upper', 'phot_g_mean_flux_over_error', 'bp_g'] are in DataFrame for classifier!")
        print("Exiting...")
        return None

    # check the regressor features
    try:
        X_pred[Features_regr]
    except KeyError:
        print("Make sure ['teff','bp_g','lum_val','v_tan','phot_g_mean_flux_over_error','v_b','radius_val','b','Rvar','flicker'] are in DataFrame for regressor!")
        print("Exiting...")
        return None

    # load random forest models (load_RF returns classifier, 100-estimator regressor, 1-estimator regressor)
    RF_class,RF_regr_100,RF_regr_1=load_RF()

    # get subset dropping NAs
    X_pred_class=X_pred[np.append(Features_regr,Features_class)].dropna()
    # drop duplicate columns
    X_pred_class=X_pred_class.loc[:,~X_pred_class.columns.duplicated()]

    # classify first...
    print("Total "+str(len(X_pred))+" stars!")
    print('Classifying '+str(len(X_pred_class))+' stars!')
    probs = RF_class.predict_proba(X_pred_class[Features_class])
    preds = probs[:,1]
    for i in range(len(preds)):
        if preds[i]<=0.4:
            preds[i]=0
        else:
            preds[i]=1
    print(str(sum(preds))+' stars have predictable rotation periods ('+ str(float(sum(preds))/float(len(X_pred))*100)+'%)')
    X_pred_class['Prot_class']=preds

    # pick out stars that have rotation periods
    X_pred_regr=X_pred_class[Features_regr]

    print("Predicting rotation periods!")
    # predict rotation periods with 1 estimator
    Prot_predictions_1est=RF_regr_1.predict(X_pred_regr.values)
    # predict rotation periods with 100 estimators
    Prot_predictions_100est=RF_regr_100.predict(X_pred_regr.values)
    print("Finished!")

    try:
        TrueProt=X_pred.iloc[X_pred_class.index].Prot.values
        try:
            TrueProt_err=X_pred.iloc[X_pred_class.index].Prot_err.values
            return pd.DataFrame(np.array((TrueProt,TrueProt_err, Prot_predictions_1est, Prot_predictions_100est)).T,columns=['True Prot','True Prot_err','Prot prediction w/ 1 est','Prot prediction w/ 100 est'])
        except:
            return pd.DataFrame(np.array((TrueProt, Prot_predictions_1est, Prot_predictions_100est)).T,columns=['True Prot','Prot prediction w/ 1 est','Prot prediction w/ 100 est'])
    except:
        print("Can't identify true rotation periods! Either they do not exist, or the column should be renamed to 'Prot'!")
        return pd.DataFrame(np.array((Prot_predictions_1est, Prot_predictions_100est)).T,columns=['Prot prediction w/ 1 est','Prot prediction w/ 100 est'])
"""--------------------------------------------- end of function to use trained RF to get Prot ---------------------------------------------""" """--------------------------------------------- start of calculating light curve statistics ---------------------------------------------"""
def getRvar(Flux):
    """Calculates the light curve Rvar.

    Args:
        Flux: The light curve flux in ppm

    Returns:
        Rvar ([float]): The variability of the light curve
    """
    return abs(np.percentile(Flux,95)-np.percentile(Flux,5))
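# Example usage (a minimal sketch with a synthetic white-noise light curve in ppm):
#
#     flux = np.random.normal(0., 500., 1000)
#     print(getRvar(flux))   # ~ the 5th-to-95th percentile range of the flux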
from astropy.timeseries import LombScargle
def getLGpeak(t,sig,sig_err):
    """Calculates the location of the highest Lomb-Scargle peak and its maximum power.

    Args:
        t: Time [days]
        sig: Flux
        sig_err: Flux error

    Returns:
        :LG_Prot ([float]): The period calculated from the Lomb-Scargle periodogram
        :LG_peaks ([float]): The maximum peak height
    """
    # Lomb-Scargle
    # get frequencies
    freq = np.linspace(1./abs(t[-1]-t[0]), 1./abs(t[1]-t[0])*0.5, 100000)
    # periods
    ps = 1./freq
    power = LombScargle(t,sig,sig_err).power(freq)
    peaks = np.array([i for i in range(1, len(ps)-1) if power[i-1]<power[i] and power[i+1] < power[i]])
    return ps[power == max(power[peaks])][0],max(power[peaks])
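# Example usage (a minimal sketch with a synthetic sinusoidal light curve):
#
#     t = np.arange(0., 90., 0.02)                                        # days
#     sig = np.sin(2.*np.pi*t/12.3) + np.random.normal(0., 0.1, len(t))   # flux
#     sig_err = np.ones_like(t)*0.1                                       # flux error
#     LG_Prot, LG_peak = getLGpeak(t, sig, sig_err)                       # LG_Prot ~ 12.3 d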
# calculate v_t, v_b by passing in a dataframe with parallax, pmra, pmdec, ra, dec
def getVs(df):
    """ Calculates the tangential velocity (v_tan) and a vertical velocity approximation (v_b).

    Args:
        df ([Pandas DataFrame]): DataFrame containing the columns 'parallax', 'pmra', 'pmdec', 'ra', 'dec', which are the parallax, proper motion in ra, proper motion in dec, right ascension and declination, respectively

    Returns:
        :v_t ([array-like]): Tangential velocity
        :v_b ([array-like]): Proxy for the vertical velocity
    """
    d = coord.Distance(parallax=np.array(df.parallax) * u.mas,allow_negative=True)
    vra = (np.array(df.pmra)*u.mas/u.yr * d).to(u.km/u.s, u.dimensionless_angles())
    vdec = (np.array(df.pmdec)*u.mas/u.yr * d).to(u.km/u.s, u.dimensionless_angles())
    v_t=np.sqrt(np.power(vra,2.)+np.power(vdec,2.)) # vtan

    # v_b as a proxy for v_z:
    c = coord.SkyCoord(ra=np.array(df.ra)*u.deg, dec=np.array(df.dec)*u.deg, distance=d,
                       pm_ra_cosdec=np.array(df.pmra)*u.mas/u.yr, pm_dec=np.array(df.pmdec)*u.mas/u.yr)
    gal = c.galactic
    v_b = (gal.pm_b * gal.distance).to(u.km/u.s, u.dimensionless_angles()) # vb
    return v_t,v_b
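# Example usage (a minimal sketch; `gaia_df` is a hypothetical single-star DataFrame
# with the required Gaia astrometry columns):
#
#     gaia_df = pd.DataFrame({'parallax': [10.], 'pmra': [5.], 'pmdec': [-3.],
#                             'ra': [290.], 'dec': [45.]})
#     v_t, v_b = getVs(gaia_df)   # astropy Quantities in km/s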
def getFlicker(t,sig):
    """Calculates the Flicker value.

    Args:
        t: Time [days]
        sig: Flux

    Returns:
        :Flicker ([float]): Flicker value
    """
    try:
        import FLICKER
    except:
        FLICKERinstall()
        import FLICKER
    return FLICKER.Flicker(t,sig)
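# Example usage (a minimal sketch; `t` and `sig` are the time and flux arrays of a
# light curve, as in the getLGpeak() example above; installs FLICKER on first use):
#
#     flicker_val = getFlicker(t, sig)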
"""--------------------------------------------- end of calculating light curve statistics ---------------------------------------------""" """--------------------------------------------- star of functions not related to RF --------------------------------------------- """ # calculates chisq def calcChi(TrueVal,PreVal,TrueVal_err): """Calculate average chisq value and ignore stars without error measurements Args: TrueVal ([array-like]): True values to compare to PreVal ([array-like]): Predicted values TrueVal_err ([array-like]): Errors for true values Returns: ave_chi ([float]): Average chisq value """ validv=0 for i in range(len(TrueVal)): if TrueVal_err[i]==0 or TrueVal_err[i]==np.nan: TrueVal[i]=0 PreVal[i]=0 TrueVal_err[i]=1 validv=validv+1 ave_chi=sum([(TrueVal[i]-PreVal[i])**2./TrueVal_err[i] for i in range(len(TrueVal_err))])/(len(PreVal)-validv) return ave_chi # calculates median relative error def MRE(TrueVal,PreVal,TrueVal_err=[]): """Calculate median relative error Args: TrueVal ([array-like]): True values to compare to PreVal ([array-like]): Predicted values TrueVal_err (Optional [array-like]): Errors for true values Returns: meree ([float]): Median relative error """ validv=0 meree=np.median([abs(TrueVal[i]-PreVal[i])/TrueVal[i] for i in range(len(TrueVal))]) return meree # plot different features vs Prot
# plot different features vs Prot
def plot_corr(df,y_vars,x_var='Prot',logplotarg=[],logarg=[],MS=1):
    """Plot correlations of one variable vs other variables specified by the user.

    Args:
        df ([Pandas DataFrame]): DataFrame containing all variables needed
        y_vars ([string list]): List of variables on the y axis
        x_var (Optional [string]): Variable for the x axis of every panel
        logplotarg (Optional [string list]): Variables to plot in log scale
        logarg (Optional [string] or [string list]): 'loglog' or 'logx' or 'logy' (default is linear). If it is a list, each entry of *logplotarg* corresponds to the scale at the same position in *logarg*
        MS (Optional [int]): Markersize for plotting

    Returns:
        <matplotlib.plot>: plots of the feature correlations
    """
    # add in Prot
    Prot=df[x_var]
    df=df[y_vars].dropna()
    Prot=Prot[df.index]
    topn=len(y_vars)

    # get subplot config
    com_mul=[]
    # get all multipliers
    for i in range(1,topn):
        if float(topn)/float(i)-int(float(topn)/float(i))==0:
            com_mul.append(i)
    # total rows and columns
    col=int(np.median(com_mul))
    row=int(topn/col)
    if col*row<topn:
        if col<row:
            row=row+1
        else:
            col=col+1

    # plot feature vs Prot
    plt.figure(figsize=(int(topn*2.5),int(topn*2.5)))
    for i in range(topn):
        plt.subplot(row,col,i+1)
        featurep=df[y_vars[i]]
        if isinstance(logarg,str): # a single scale string applied to every variable in logplotarg
            if y_vars[i] in logplotarg:
                if logarg=='loglog':
                    plt.loglog(Prot,featurep,'k.',markersize=MS)
                elif logarg=='logx':
                    plt.semilogx(Prot,featurep,'k.',markersize=MS)
                elif logarg=='logy':
                    plt.semilogy(Prot,featurep,'k.',markersize=MS)
            else:
                plt.plot(Prot,featurep,'k.',markersize=MS)
        else: # a list of scales, one per entry of logplotarg
            if y_vars[i] in logplotarg:
                logsca=logarg[logplotarg.index(y_vars[i])]
                if logsca=='loglog':
                    plt.loglog(Prot,featurep,'k.',markersize=MS)
                elif logsca=='logx':
                    plt.semilogx(Prot,featurep,'k.',markersize=MS)
                elif logsca=='logy':
                    plt.semilogy(Prot,featurep,'k.',markersize=MS)
            else:
                plt.plot(Prot,featurep,'k.',markersize=MS)
        plt.title(y_vars[i],fontsize=25)
        stddata=np.std(featurep)
        plt.ylim([np.median(featurep)-3*stddata,np.median(featurep)+3*stddata])
        plt.xlabel(x_var)
        plt.ylabel(y_vars[i])
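# Example usage (a minimal sketch; `df` is a hypothetical DataFrame containing 'Prot'
# plus the feature columns being inspected):
#
#     plot_corr(df, ['Rvar', 'teff'], x_var='Prot',
#               logplotarg=['Rvar'], logarg=['loglog'], MS=2)
#     plt.show()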
"""--------------------------------------------- end of functions not related to RF --------------------------------------------- """ """--------------------------------------------- RF training and results --------------------------------------------- """ # RF classifier
def RFclassifier(df,testF,modelout=False,traind=0.8,ID_on='KID',X_train_ind=[],X_test_ind=[],target_var='Prot_flag',n_estimators=100, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None):
    """Train an RF classifier model and predict values for the cross-validation dataset.

    It uses the scikit-learn Random Forest classifier. All default hyper-parameters are taken from the scikit-learn model and can be changed through the optional inputs. For more details on the hyper-parameters, see https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html. To train an RF model to predict rotation periods, input a pandas DataFrame with column names as well as a list of attribute names.

    Args:
        df ([Pandas DataFrame]): DataFrame containing all variables needed
        testF ([string list]): List of feature names used to train
        modelout (Optional [bool]): Whether to only output the trained model
        traind (Optional [float]): Fraction of data used to train; the rest will be used to perform the cross-validation test (default 0.8)
        ID_on (Optional [string]): Name of the star identifier column (default 'KID'). If the specified ID column does not exist, the index is used as the ID
        X_train_ind (Optional [list]): List of *ID_on* for the training set; if not specified, a random *traind* fraction of indexes is taken from the *ID_on* column
        X_test_ind (Optional [list]): List of *ID_on* for the testing set; if not specified, the remaining (1-*traind*) fraction of indexes from the *ID_on* column that is not in the training set (*X_train_ind*) is used
        target_var (Optional [string]): Label column name (default 'Prot_flag')

    Returns:
        <RF model>, <pandas.Series>:

        :regr: Sklearn RF classifier model (for attributes see https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

        :<pandas.Series> containing:

            :actrualF ([string list]): Actual features used
            :importance ([float list]): Impurity-based feature importance, ordered as *actrualF*
            :ID_train ([list]): List of *ID_on* used for the training set
            :ID_test ([list]): List of *ID_on* used for the testing set
            :predictp ([float list]): List of predictions on the testing set
            :X_test ([matrix]): Matrix used to predict label values for the testing set
            :y_test ([array-like]): Array of true label values of the testing set
            :X_train ([matrix]): Matrix used to predict label values for the training set
            :y_train ([array-like]): Array of true label values of the training set
    """
    print('Simplest example:\n regr,regr_outs = RFclassifier(df,testF)\n')
    if len(X_train_ind)==0:
        print('Fraction of data used to train:',traind)
    else:
        print('Training KID specified!\n')
        print('Estimated fraction of data used to train:',float(len(X_train_ind))/float(len(df[target_var])))
    print('# of Features attempt to train:',len(testF))
    print('Features attempt to train:',testF)

    # check if there is an ID
    if ID_on not in df.columns:
        df[ID_on]=range(len(df))
        print('ID column not found, using index as ID!')

    fl=len(df.columns) # how many features
    keys=range(fl)
    flib=dict(zip(keys, df.columns))
    featl_o=len(df[target_var]) # old feature length before dropping
    actrualF=[] # actual features used

    # fill in feature array
    lenX=0
    missingf=[]
    for i in df.columns:
        feature=df[i].values
        if (type(feature[0]) is not str) and (i in testF):
            if sum(np.isnan(feature))<0.1*featl_o:
                lenX=lenX+1
                actrualF.append(i)
            else:
                missingf.append(i)

    X=df[actrualF]
    X=X.replace([np.inf, -np.inf], np.nan)
    X=X.dropna()
    featl=np.shape(X)[0]
    #print(featl)

    print(str(featl_o)+' stars in dataframe!')
    if len(missingf)!=0:
        print('Missing features:',missingf)
    if (featl_o-featl)!=0:
        print('Missing '+ str(featl_o-featl)+' stars from null values in data!\n')
    print(str(featl)+' total stars used for RF!')

    #print(X_train_ind)
    if len(X_train_ind)==0:
        # output
        y=df[target_var][X.index].values
        ID_ar=df[ID_on][X.index].values
        X=X.values
        Ntrain = int(traind*featl)

        # Choose stars at random and split.
        shuffle_inds = np.arange(len(y))
        np.random.shuffle(shuffle_inds)
        train_inds = shuffle_inds[:Ntrain]
        test_inds = shuffle_inds[Ntrain:]
        y_train, ID_train, X_train = y[train_inds],ID_ar[train_inds],X[train_inds, :]
        y_test, ID_test, X_test = y[test_inds], ID_ar[test_inds],X[test_inds, :]

        test_inds,y_test, ID_test, X_test=zip(*sorted(zip(test_inds,y_test, ID_test, X_test)))
        test_inds=np.array(test_inds)
        y_test=np.array(y_test)
        ID_test=np.array(ID_test)
        X_test=np.asarray(X_test)
    else:
        datafT=df.loc[X.index].loc[df[ID_on].isin(X_train_ind)]
        datafTes=df.loc[X.index].loc[df[ID_on].isin(X_test_ind)]
        y_train, X_train = datafT[target_var].values, X.loc[df[ID_on].isin(X_train_ind)].values
        y_test, X_test = datafTes[target_var].values, X.loc[df[ID_on].isin(X_test_ind)].values

    print(str(len(y_train))+' training stars!')

    # run random forest
    regr = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=min_impurity_decrease, min_impurity_split=min_impurity_split, bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, class_weight=class_weight)
    regr.fit(X_train, y_train)

    # get the importance of each feature
    importance=regr.feature_importances_
    print('Finished training! Making predictions!')

    # make prediction
    predictp=regr.predict(X_test)
    print('Finished predicting!')

    if len(X_train_ind)!=0:
        ID_train=datafT[ID_on].values
        ID_test=datafTes[ID_on].values
        ID_train=[int(i) for i in ID_train]
        ID_test=[int(i) for i in ID_test]

    print('Finished!')
    return regr,pd.Series([importance,actrualF,ID_train,ID_test,predictp,X_test,y_test,X_train,y_train],index=['importance','actrualF','ID_train','ID_test','prediction','X_test','y_test','X_train','y_train'])
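# Example usage (a minimal sketch; `df` is a hypothetical DataFrame with a binary
# 'Prot_flag' label column, a 'KID' identifier and the feature columns in `testF`):
#
#     testF = ['Rvar', 'teff', 'bp_g']
#     clf, clf_outs = RFclassifier(df, testF, traind=0.8, target_var='Prot_flag')
#     print(clf_outs['importance'])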
# RF regressor
def RFregressor(df,testF,modelout=False,traind=0.8,ID_on='KID',X_train_ind=[],X_test_ind=[],target_var='Prot',target_var_err='Prot_err',chisq_out=False,MREout=False,n_estimators=100, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False):
    """Train an RF regression model and perform a cross-validation test.

    It uses the scikit-learn Random Forest regressor. All default hyper-parameters are taken from the scikit-learn model and can be changed through the optional inputs. For more details on the hyper-parameters, see https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html. To train an RF model to predict rotation periods, input a pandas DataFrame with column names as well as a list of attribute names.

    Args:
        df ([Pandas DataFrame]): DataFrame containing all variables needed
        testF ([string list]): List of feature names used to train
        modelout (Optional [bool]): Whether to only output the trained model
        traind (Optional [float]): Fraction of data used to train; the rest will be used to perform the cross-validation test (default 0.8)
        ID_on (Optional [string]): Name of the star identifier column (default 'KID'). If the specified ID column does not exist, the index is used as the ID
        X_train_ind (Optional [list]): List of *ID_on* for the training set; if not specified, a random *traind* fraction of indexes is taken from the *ID_on* column
        X_test_ind (Optional [list]): List of *ID_on* for the testing set; if not specified, the remaining (1-*traind*) fraction of indexes from the *ID_on* column that is not in the training set (*X_train_ind*) is used
        target_var (Optional [string]): Label column name (default 'Prot')
        target_var_err (Optional [string]): Label error column name (default 'Prot_err')
        chisq_out (Optional [bool]): If true, only output the average chisq value
        MREout (Optional [bool]): If true, only output the median relative error. If both *chisq_out* and *MREout* are true, output only these two values

    Returns:
        <RF model>, <pandas.Series>:

        :regr: Sklearn RF regressor model (for attributes see https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)

        :<pandas.Series> containing:

            :actrualF ([string list]): Actual features used
            :importance ([float list]): Impurity-based feature importance, ordered as *actrualF*
            :ID_train ([list]): List of *ID_on* used for the training set
            :ID_test ([list]): List of *ID_on* used for the testing set
            :predictp ([float list]): List of predictions on the testing set
            :ave_chi ([float]): Average chisq on the cross-validation (testing) set
            :MRE_val ([float]): Median relative error on the cross-validation (testing) set
            :X_test ([matrix]): Matrix used to predict label values for the testing set
            :y_test ([array-like]): Array of true label values of the testing set
            :X_train ([matrix]): Matrix used to predict label values for the training set
            :y_train ([array-like]): Array of true label values of the training set
    """
    print('Simplest example:\n regr,regr_outs = RFregressor(df,testF)\n')
    if len(X_train_ind)==0:
        print('Fraction of data used to train:',traind)
    else:
        print('Training KID specified!\n')
        print('Estimated fraction of data used to train:',float(len(X_train_ind))/float(len(df[target_var])))
    print('# of Features attempt to train:',len(testF))
    print('Features attempt to train:',testF)

    # check if there is an ID
    if ID_on not in df.columns:
        df[ID_on]=range(len(df))
        print('ID column not found, using index as ID!')

    fl=len(df.columns) # how many features
    keys=range(fl)
    flib=dict(zip(keys, df.columns))
    featl_o=len(df[target_var]) # old feature length before dropping
    actrualF=[] # actual features used

    # fill in feature array
    lenX=0
    missingf=[]
    for i in df.columns:
        feature=df[i].values
        if (type(feature[0]) is not str) and (i in testF):
            if sum(np.isnan(feature))<0.1*featl_o:
                lenX=lenX+1
                actrualF.append(i)
            else:
                missingf.append(i)

    X=df[actrualF]
    X=X.replace([np.inf, -np.inf], np.nan)
    X=X.dropna()
    featl=np.shape(X)[0]
    #print(featl)

    print(str(featl_o)+' stars in dataframe!')
    if len(missingf)!=0:
        print('Missing features:',missingf)
    if (featl_o-featl)!=0:
        print('Missing '+ str(featl_o-featl)+' stars from null values in data!\n')
    print(str(featl)+' total stars used for RF!')

    #print(X_train_ind)
    if len(X_train_ind)==0:
        # output
        y=df[target_var][X.index].values
        y_err=df[target_var_err][X.index].values
        ID_ar=df[ID_on][X.index].values
        X=X.values
        Ntrain = int(traind*featl)

        # Choose stars at random and split.
        shuffle_inds = np.arange(len(y))
        np.random.shuffle(shuffle_inds)
        train_inds = shuffle_inds[:Ntrain]
        test_inds = shuffle_inds[Ntrain:]
        y_train, y_train_err, ID_train, X_train = y[train_inds], y_err[train_inds],ID_ar[train_inds],X[train_inds, :]
        y_test, y_test_err, ID_test, X_test = y[test_inds], y_err[test_inds],ID_ar[test_inds],X[test_inds, :]

        test_inds,y_test, y_test_err, ID_test, X_test=zip(*sorted(zip(test_inds,y_test, y_test_err, ID_test, X_test)))
        test_inds=np.array(test_inds)
        y_test=np.array(y_test)
        y_test_err=np.array(y_test_err)
        ID_test=np.array(ID_test)
        X_test=np.asarray(X_test)
    else:
        datafT=df.loc[X.index].loc[df[ID_on].isin(X_train_ind)]
        datafTes=df.loc[X.index].loc[df[ID_on].isin(X_test_ind)]
        y_train, y_train_err,X_train = datafT[target_var].values, datafT[target_var_err].values,X.loc[df[ID_on].isin(X_train_ind)].values
        y_test, y_test_err,X_test = datafTes[target_var].values, datafTes[target_var_err].values,X.loc[df[ID_on].isin(X_test_ind)].values

    print(str(len(y_train))+' training stars!')

    # run random forest
    regr = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=min_impurity_decrease, min_impurity_split=min_impurity_split, bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start)
    regr.fit(X_train, y_train)

    # get the importance of each feature
    importance=regr.feature_importances_
    print('Finished training! Making predictions!')

    # make prediction
    predictp=regr.predict(X_test)
    print('Finished predicting! Calculating statistics!')

    # calculate chisq and MRE
    MRE_val=MRE(y_test,predictp,y_test_err)
    ave_chi=calcChi(y_test,predictp,y_test_err)
    print('Median Relative Error is:',MRE_val)
    print('Average chi^2 is:',ave_chi)

    if chisq_out:
        if MREout:
            print('Finished!')
            return ave_chi,MRE_val
        else:
            print('Finished!')
            return ave_chi
    elif MREout:
        print('Finished!')
        return MRE_val
    else:
        if len(X_train_ind)!=0:
            ID_train=datafT[ID_on].values
            ID_test=datafTes[ID_on].values
            ID_train=[int(i) for i in ID_train]
            ID_test=[int(i) for i in ID_test]
        print('Finished!')
        return regr,pd.Series([importance,actrualF,ID_train,ID_test,predictp,ave_chi,MRE_val,X_test,y_test,X_train,y_train],index=['importance','actrualF','ID_train','ID_test','prediction','ave_chi2','MRE','X_test','y_test','X_train','y_train'])
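# Example usage (a minimal sketch; `df` is a hypothetical DataFrame with 'Prot',
# 'Prot_err', a 'KID' identifier and the feature columns in `testF`):
#
#     testF = ['Rvar', 'teff', 'bp_g', 'lum_val']
#     regr, regr_outs = RFregressor(df, testF, traind=0.8)
#     print(regr_outs['MRE'], regr_outs['ave_chi2'])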
# for plotting results for importance and predict vs true
def plot_result(actrualF,importance,prediction,y_test,y_test_err=[],topn=20,MS=3,labelName='Period'):
    """Plot the impurity-based feature importances as well as predicted vs true values for a random forest model.

    Args:
        actrualF ([array-like]): Features used (from the output of RFregressor())
        importance ([array-like]): Feature importances of the model (from the output of RFregressor())
        prediction ([array-like]): Predicted values (from the output of RFregressor())
        y_test ([array-like]): True values (from the output of RFregressor())
        y_test_err (Optional [array-like]): Errors for the true values (from the output of RFregressor())
        topn (Optional [int]): How many of the most important features to plot
        MS (Optional [int]): Markersize for plotting true vs predicted values
        labelName (Optional [string]): Label name

    Returns:
        <matplotlib.plot>: importance plot as well as true vs prediction plot
    """
    plt.rcParams.keys()
    plt.rc('font', family='serif')
    params = {
        'axes.labelsize': 30,
        'axes.linewidth': 1.5,
        'legend.fontsize': 25,
        'legend.frameon': False,
        'lines.linewidth': 2,
        'xtick.direction': 'in',
        'xtick.labelsize': 25,
        'xtick.major.bottom': True,
        'xtick.major.pad': 10,
        'xtick.major.size': 10,
        'xtick.major.width': 1,
        'xtick.minor.bottom': True,
        'xtick.minor.pad': 3.5,
        'xtick.minor.size': 5,
        'xtick.minor.top': True,
        'xtick.minor.visible': True,
        'xtick.minor.width': 1,
        'xtick.top': True,
        'ytick.direction': 'in',
        'ytick.labelsize': 25,
        'ytick.major.pad': 10,
        'ytick.major.size': 10,
        'ytick.major.width': 1,
        'ytick.minor.pad': 3.5,
        'ytick.minor.size': 5,
        'ytick.minor.visible': True,
        'ytick.minor.width': 1,
        'ytick.right': True,
    }
    plt.rcParams.update(params)

    topn=min([topn,len(actrualF)])
    # zip the importances with their feature names
    list1 = list(zip(actrualF,importance))
    # sort the zipped list
    decend=sorted(list1, key=lambda x:x[1],reverse=True)
    #print(decend)
    # how many features to plot
    x=range(topn)

    #################### get most important features ############################################################
    y_val=[decend[i][1] for i in range(topn)]
    my_xticks=[decend[i][0] for i in range(topn)]
    plt.figure(figsize=(20,5))
    plt.title('Most important features',fontsize=25)
    plt.xticks(x, my_xticks)
    plt.plot(x, y_val,'k-')
    plt.xlim([min(x),max(x)])
    plt.xticks(rotation=90)
    plt.ylabel('importance')
    #################### get most important features ############################################################

    # prediction vs true
    if len(y_test_err)==0:
        plt.figure(figsize=(10,8))
        plt.plot(sorted(prediction),sorted(prediction),'k-',label='y=x')
        plt.plot(sorted(prediction),sorted(1.1*prediction),'b--',label='10% Error')
        plt.plot(sorted(prediction),sorted(0.9*prediction),'b--')
        plt.plot(y_test,prediction,'r.',markersize=MS,alpha=0.2)
        plt.ylabel('Predicted '+labelName)
        plt.xlabel('True '+labelName)
        plt.ylim([min(prediction),max(prediction)])
        plt.xlim([min(prediction),max(prediction)])
        plt.legend()
    else:
        plt.figure(figsize=(20,8))
        plt.subplot(1,2,1)
        plt.plot(sorted(prediction),sorted(prediction),'k-',label='y=x')
        plt.plot(sorted(prediction),sorted(1.1*prediction),'b--',label='10% Error')
        plt.plot(sorted(prediction),sorted(0.9*prediction),'b--')
        plt.plot(y_test,prediction,'r.',markersize=MS,alpha=0.2)
        plt.ylabel('Predicted '+labelName)
        plt.xlabel('True '+labelName)
        plt.ylim([min(prediction),max(prediction)])
        plt.xlim([min(prediction),max(prediction)])
        plt.legend()

        plt.subplot(1,2,2)
        plt.plot(sorted(prediction),sorted(prediction),'k-',label='y=x')
        plt.plot(sorted(prediction),sorted(1.1*prediction),'b--',label='10% Error')
        plt.plot(sorted(prediction),sorted(0.9*prediction),'b--')
        plt.errorbar(y_test,prediction,xerr=y_test_err,fmt='r.',markersize=MS,alpha=0.2)
        plt.ylabel('Predicted '+labelName)
        plt.xlabel('True '+labelName)
        plt.ylim([min(prediction),max(prediction)])
        plt.xlim([min(prediction),max(prediction)])
        plt.legend()

    avstedv=MRE(y_test,prediction,y_test_err)
    print('Median relative error is: ',avstedv)
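# Example usage (a minimal sketch, reusing the outputs of the RFregressor() example above):
#
#     plot_result(regr_outs['actrualF'], regr_outs['importance'],
#                 regr_outs['prediction'], regr_outs['y_test'], topn=10, MS=3)
#     plt.show()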