How to use the pandas.merge function in pandas

github xiaozhouwang / kaggle_Microsoft_Malware / kaggle_Microsoft_malware_full / View on Github external
mine = mine.as_matrix()

    #mine_test = pd.merge(grams_test, test_daf,on='Id')
    mine_test = grams_test
    mine_test = pd.merge(mine_test, test_dll,on='Id')

    mine_test_id = mine_test.Id
    del mine_test['Id']
    clf_se = RF(n_estimators=500, n_jobs=-1,random_state = 0),mine_labels)
    mine_train = np.array(clf_se.transform(mine, '1.25*mean'))
    mine_test = np.array(clf_se.transform(mine_test, '1.25*mean'))

    train_mine = pd.DataFrame(np.column_stack((mine_Id, mine_train)), columns=['Id']+['mine_'+str(x) for x in xrange(mine_train.shape[1])]).convert_objects(convert_numeric=True)
    test_mine = pd.DataFrame(np.column_stack((mine_test_id, mine_test)), columns=['Id']+['mine_'+str(x) for x in xrange(mine_test.shape[1])]).convert_objects(convert_numeric=True)
    train = pd.merge(train, train_mine, on='Id')
    test = pd.merge(test, test_mine, on='Id')

    train_image = pd.read_csv("train_asm_image.csv", usecols=['Id']+['asm_%i'%i for i in xrange(800)])
    test_image = pd.read_csv("test_asm_image.csv", usecols=['Id']+['asm_%i'%i for i in xrange(800)])
    train = pd.merge(train, train_image, on='Id')
    test = pd.merge(test, test_image, on='Id')
    print "the data dimension:"
    print train.shape, test.shape
    return train, test
def gen_submission(model):
github BayAreaMetro / travel-model-one / utilities / PBA40 / metrics / View on Github external
# Join to persons to get person_id, fp_choice
    joint_tours = pandas.merge(left=joint_tours, right=persons,  on=['hh_id','person_num'])
    # Verify we didn't lose or add rows and that we found everyone's person id
    assert(len(joint_tours) == joint_tour_participants)
    assert(len(joint_tours.loc[pandas.notnull(joint_tours.person_id)] == joint_tour_participants))

    # drop tour_participants so we can merge
    joint_tours.drop('tour_participants', axis=1, inplace=True)
    assert(sorted(list(indiv_tours.columns.values)) == sorted(list(joint_tours.columns.values)))
    tours = pandas.concat([indiv_tours, joint_tours])

    # tour duration
    tours['tour_duration'] = tours.end_hour - tours.start_hour
    # origin county
    tours = pandas.merge(left=tours,         right=tazdata[['COUNTY']],
                         left_on='orig_taz', right_index=True)
    tours.rename(columns={'COUNTY':'orig_county'}, inplace=True)
    # destination county, parking costs
    tours = pandas.merge(left=tours,         right=tazdata,
                         left_on='dest_taz', right_index=True)
    tours.rename(columns={'COUNTY':'dest_county'}, inplace=True)
    assert(len(tours) == joint_tour_participants + indiv_tours_participants)

    # make sure this is a good index
    dupes = tours.duplicated(subset=['hh_id','person_id','person_num','tour_category','tour_purpose','tour_id'])

    tours['tour_purpose2'] = tours.tour_purpose  # duplicate for index
    tours.set_index(['hh_id','person_id','person_num','tour_category','tour_purpose2','tour_id'], inplace=True)

    # default: tour_duration * OPRKCST
github ozak / georasters / georasters / View on Github external
else LineString([x[0], x[0]]))
                    df2 = gp.GeoDataFrame(df2, crs=cea)
                if isolation:
                    df2['Iso'] = grisolation
                if count == 0:
                    self.grdist = df2.copy()
                    self.grdist = self.grdist.append(df2)
                count += 1
        if routes:
            self.grdist = gp.GeoDataFrame(self.grdist, crs=cea)
        if export_shape:
            start_pointscols = sources.columns.values
            end_pointscols = destinations.columns.values
            if 'geometry' in end_pointscols:
                self.grdist = pd.merge(self.grdist, end_points[['ID'] + end_pointscols.tolist()].drop('geometry', axis=1), left_on='ID2', right_on='ID', how='left')
                self.grdist = pd.merge(self.grdist, end_points[['ID']+end_pointscols.tolist()], left_on='ID2', right_on='ID', how='left')
            if 'geometry' in self.start_pointscols:
                self.grdist = pd.merge(self.grdist, start_points[['ID']+start_pointscols.tolist()].drop('geometry', axis=1), left_on='ID1', right_on='ID', how='left',
                             suffixes=['_2', '_1'])
                self.grdist = pd.merge(self.grdist, start_points[['ID']+start_pointscols.tolist()], left_on='ID1', right_on='ID', how='left',
                             suffixes=['_2', '_1'])
            self.grdist = gp.GeoDataFrame(self.grdist, crs=cea)
github duxuhao / JData-2018 / Features / View on Github external
def ActionFeatures(Startday, PrepareDays, PredictDays, temp, dftemp):
    tempfeature = temp[temp.a_day_series < Startday][temp.a_day_series >= Startday-PrepareDays].reset_index(drop=True)
    templabel = temp[temp.a_day_series >= Startday][temp.a_day_series < Startday+PredictDays].reset_index(drop=True)
    dftemp = pd.merge(dftemp, templabel[['user_id','a_date']].drop_duplicates(subset = 'user_id', keep='last'), on = 'user_id',how='left').fillna(0)
    Checkcnt = tempfeature[['user_id','a_date']].groupby(['user_id']).count().reset_index()
    Checkcnt.columns = ['user_id', 'checkcnt']
    dftemp = pd.merge(dftemp,Checkcnt, how = 'left', on = 'user_id')
    monthcnt = tempfeature[['user_id','a_month_series']].drop_duplicates().groupby(['user_id']).size().reset_index()
    monthcnt.columns = ['user_id', 'a_monthcnt']
    dftemp = pd.merge(dftemp,monthcnt, how = 'left', on = 'user_id')
    tempfeature['daybeforelastcheck'] = tempfeature.sort_values(by=['user_id','a_day_series']).a_day_series - tempfeature.sort_values(by=['user_id','a_day_series']).groupby(['user_id']).shift(1).a_day_series
    for f in ['daybeforelastcheck', 'price', 'para_1', 'para_2', 'para_3', 'a_num','a_type','a_month_series','a_day_series']:
        a = tempfeature[['user_id',f]].groupby(['user_id']).mean().reset_index()
        a.columns = ['user_id', '{}_a_ave'.format(f)]
        dftemp = pd.merge(dftemp,a, how = 'left', on = 'user_id')
        a = tempfeature[['user_id',f]].groupby(['user_id']).std().reset_index()
        a.columns = ['user_id', '{}_a_std'.format(f)]
        dftemp = pd.merge(dftemp,a, how = 'left', on = 'user_id')
        a = tempfeature[['user_id',f]].groupby(['user_id']).sum().reset_index()
        a.columns = ['user_id', '{}_a_sum'.format(f)]
        dftemp = pd.merge(dftemp,a, how = 'left', on = 'user_id')
        a = tempfeature[['user_id',f]].groupby(['user_id']).median().reset_index()
        a.columns = ['user_id', '{}_a_median'.format(f)]
github wai-i / Pair-Trading-Reinforcement-Learning / STRATEGY / View on Github external
def clean_data(cls, x, y, on, col_name):
        """Align two frames on a key and keep only rows complete in both.

        Infinite values in ``x`` and ``y`` are replaced by NaN *in place*
        (the caller's frames are modified). The frames are then outer-merged
        on ``on`` and every row containing a null in any column is dropped.

        Parameters
        ----------
        cls : unused here; first positional slot of the original signature.
        x, y : pandas.DataFrame
            Frames that both contain ``on`` and ``col_name``.
        on : merge key passed straight to ``pd.merge``.
        col_name : str
            Column present in both frames; read back from the merge result
            as ``col_name + '_x'`` and ``col_name + '_y'``.

        Returns
        -------
        (df_x, df_y) : two new frames, each holding ``on`` and ``col_name``,
        restricted to the rows that survived the null filter.
        """
        # Treat +/-inf as missing so the null filter below removes them.
        for frame in (x, y):
            frame.replace([np.inf, -np.inf], np.nan, inplace=True)

        joined = pd.merge(left=x, right=y, on=on, how='outer')
        row_is_complete = joined.notnull().all(axis=1)
        complete = joined.loc[row_is_complete, :]

        def _one_side(suffix):
            # Rebuild a fresh frame from raw values so the index restarts.
            side = pd.DataFrame()
            side[on] = complete[on].values
            side[col_name] = complete[col_name + suffix].values
            return side

        return _one_side('_x'), _one_side('_y')
github USEPA / Federal-LCA-Commons-Elementary-Flow-List / scripts / View on Github external
import fedelemflowlist
import pandas as pd
from fedelemflowlist.globals import outputpath

#Set name of mapping file. More than one mapping file can be used
mapping_to_use = ['openLCA']

if __name__ == '__main__':
    mapping = fedelemflowlist.get_flowmapping(mapping_to_use)
    #Get Flow UUIDs for flows used in selected mapping
    mapping_flow_uuids = pd.DataFrame(pd.unique(mapping['TargetFlowUUID']),columns=["Flow UUID"])

    #Get all flows
    all_flows = fedelemflowlist.get_flows()
    #Subset all flows to get just those used in selected mapping
    flows_used_in_mapping =  pd.merge(all_flows,mapping_flow_uuids)

    #Now write out flows and mappings
    export_name = ''
    for s in mapping_to_use:
        export_name = export_name + s + '_'
    export_name = export_name+ ''
github openelections / openelections-data-oh / 2008 / View on Github external
def make_attorney_df():
    """Build a long-format results table for the Attorney General race.

    Reads the first HTML table found at ``URLS['attorney general']``,
    promotes its first row to the column header, melts the per-candidate
    columns into rows and attaches each candidate's party.

    Returns:
        A DataFrame with the columns ``county``, ``candidate``, ``votes``,
        ``party`` and ``office`` (always ``'Attorney General'``).
    """
    df = pd.read_html(URLS['attorney general'])[0]
    # The first data row holds the real column labels.
    df.columns = df.iloc[0]
    df = df.drop(df.index[0])

    df.columns = ['county'] + list(df.columns[1:])
    df_ = pd.melt(df, id_vars=['county'], value_vars=list(df.columns[1:]))

    # Melted rows with a null county are used as the candidate -> party
    # lookup; they are removed from the vote rows just below.
    party_df = df_[pd.isnull(df_['county'])][['variable', 'value']]
    party_df.columns = ['candidate', 'party']

    df_.columns = ['county', 'candidate', 'votes']
    df_ = df_.dropna(subset=['county'])
    # 'candidate' is the only column the two frames share; name it
    # explicitly instead of relying on the implicit common-column join.
    df_ = pd.merge(df_, party_df, how='left', on='candidate')

    df_['candidate'] = df_['candidate'].str.rstrip(' *')
    # Remove any "(...)" group. regex=True is required: recent pandas
    # treats the pattern as a literal string by default, which would leave
    # the parenthetical in place. A raw string avoids invalid-escape warnings.
    df_['candidate'] = df_['candidate'].str.replace(r'\((.*?)\)', '', regex=True)
    df_['candidate'] = df_['candidate'].str.rstrip('()')
    df_['office'] = 'Attorney General'
    attorney_df = df_

    return attorney_df
github abides-sim / abides / util / formatting / View on Github external
def extract_events_from_stream(stream_df, event_type):
    """ Extracts specific event from stream.

    events = stream_df.loc[stream_df.EventType == event_type][['EventTime', 'Event']]
    events_json = events['Event'].to_json(orient="records")
    json_struct = json.loads(events_json)
    # TODO : get rid of structs containing all `int` types
    event_extracted = json_normalize(json_struct)
    event_extracted = pd.merge(events['EventTime'].reset_index(), event_extracted, left_index=True, right_index=True)

    if not event_extracted.empty:
        event_extracted = event_extracted[['EventTime', 'order_id', 'limit_price', 'quantity', 'is_buy_order']]
        event_extracted.rename(columns={'EventTime': 'TIMESTAMP',
                                        'order_id': 'ORDER_ID',
                                        'limit_price': 'PRICE',
                                        'quantity': 'SIZE',
                                        'is_buy_order': 'BUY_SELL_FLAG'}, inplace=True)
        event_extracted = pd.DataFrame({
            'TIMESTAMP': [],
            'ORDER_ID': [],
            'PRICE': [],
            'SIZE': [],
            'BUY_SELL_FLAG': []
github CxAalto / gtfspy / gtfspy / View on Github external
def _compute_number_of_frequency_generated_stop_times(self, gtfs_source_path):
        """
        Same as for "_frequency_generated_trips_rows" but for stop times table

        Parameters
        ----------
        gtfs_source_path:
            Forwarded unchanged to ``self._frequency_generated_trips_rows``
            and to ``source_csv_to_pandas``.

        Returns
        -------
        int
            Sum of the ``n_trips`` column over the outer merge of the
            frequency frame and the "stop_times" table on ``trip_id``,
            with missing ``n_trips`` values counted as 1.
        """
        df_freq = self._frequency_generated_trips_rows(gtfs_source_path, return_df_freq=True)
        df_stop_times = source_csv_to_pandas(gtfs_source_path, "stop_times")
        # Outer join keeps stop_times rows that have no match in df_freq;
        # their n_trips is NaN after the merge and is counted as 1 below.
        df_stop_freq = pd.merge(df_freq, df_stop_times, how='outer', on='trip_id')
        return int(df_stop_freq['n_trips'].fillna(1).sum(axis=0))
github holoviz / datashader / datashader / View on Github external
Given a graph defined as a pair of dataframes (nodes and edges), the
    nodes (id, coordinates) and edges (id, source, target, weight) are
    joined by node id to create a single dataframe with each source/target
    of an edge (including its optional weight) replaced with the respective
    coordinates. For both nodes and edges, each id column is assumed to be
    the index.

    We also return the dimensions of each point in the final dataframe and
    the accumulator function for drawing to an image.

    df = pd.merge(edges, nodes, left_on=[params.source], right_index=True)
    df = df.rename(columns={params.x: 'src_x', params.y: 'src_y'})

    df = pd.merge(df, nodes, left_on=[], right_index=True)
    df = df.rename(columns={params.x: 'dst_x', params.y: 'dst_y'})

    df = df.sort_index()
    df = df.reset_index()

    if params.include_edge_id:
        df = df.rename(columns={'id': 'edge_id'})

    include_weight = params.weight and params.weight in edges

    if params.include_edge_id:
        if include_weight:
            segment_class = WeightedSegment
            segment_class = UnweightedSegment