if len(refTypeList) == 2:
    print('##########merging annotation data')
    rstFolder = 'annotation_result'
    if keepZeros:
        rstFolder = rstFolder + '_keep_all_genes'
    else:
        rstFolder = rstFolder + '_keep_expressed_genes'
    savefolder = os.path.join(savefolder, rstFolder)
    for testMethod in testMethodList:
        tomergeList = glob.glob(os.path.join(savefolder, "human*%s*.xlsx" % testMethod))
        tomergeList = [i for i in tomergeList if "_Avg" not in i]
        topAnnList = []
        for tomerge in tomergeList:
            # merge the human and mouse results; keep header levels 0 and 2
            # (identifier and annotation) of the three-row Excel header
            humanAnn = pd.read_excel(tomerge, header=[0, 1, 2])
            humanAnn.columns = pd.MultiIndex.from_arrays(
                [list(humanAnn.columns.get_level_values(0)),
                 list(humanAnn.columns.get_level_values(2))],
                names=('identifier', 'annotation'))
            mouseAnn = pd.read_excel(tomerge.replace('human_', 'mouse_'), header=[0, 1, 2])
            mouseAnn.columns = pd.MultiIndex.from_arrays(
                [list(mouseAnn.columns.get_level_values(0)),
                 list(mouseAnn.columns.get_level_values(2))],
                names=('identifier', 'annotation'))
            mergedAnn = pd.concat([humanAnn, mouseAnn])
            # split the merged results into per-cell frames (two columns each)
            # and sort each cell's annotations in parallel
            mergedAnnList = np.split(mergedAnn, len(mergedAnn.columns) // 2, axis=1)
            p = multiprocessing.Pool(coreNum)
            resultList = p.map(SortAnno, mergedAnnList)
            p.close()
            p.join()
            mergedAnn = pd.concat([i[0] for i in resultList], axis=1)
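# For context, a minimal self-contained sketch (toy names, not from the project
# above) of what the column collapsing does: build a three-level column index,
# then keep only levels 0 and 2 and discard the middle level.
import pandas as pd

cols = pd.MultiIndex.from_arrays([['cell1', 'cell1'],
                                  ['meta', 'meta'],
                                  ['cell type', 'score']])
demo = pd.DataFrame([['T cell', 0.9]], columns=cols)
demo.columns = pd.MultiIndex.from_arrays(
    [list(demo.columns.get_level_values(0)), list(demo.columns.get_level_values(2))],
    names=('identifier', 'annotation'))
print(demo)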
import os
import re
import math
import pandas as pd
from random import choice, randint, choices
from backstage.models import College, Major, AdmClass, Student, \
    Teacher, ClassRoom, MajorPlan
from scoreManagement.models import Course, Teaching, MajorCourses
from django.db.utils import IntegrityError

base_dir = '../others/'
xls_file = '2016-CS-courses.xlsx'
data_frame = pd.read_excel(os.path.join(base_dir, xls_file))


def get_index():
    print(data_frame.columns)
    # College of Information Science and Technology
    college = College.objects.get(name='信息科学与技术学院')
    cs_courses = []
    # columns: course code (课程代码), course name (课程名称), credits (学分)
    for item1, item2, item3 in zip(data_frame['课程代码'], data_frame['课程名称'], data_frame['学分']):
        try:
            c = Course.objects.filter(cno=item1)[0]
            cs_courses.append(c)
        except IndexError:
            continue  # no Course on record with this code; skip it
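# A tidier equivalent for the lookup above, reusing the same names from the
# loop: Django's QuerySet.first() returns None on an empty result instead of
# raising, so the try/except collapses into a plain None check.
#
#     c = Course.objects.filter(cno=item1).first()
#     if c is not None:
#         cs_courses.append(c)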
import numpy as np


def programmer_2():
    # `path` is defined elsewhere in the source module
    datafile = path + '/data/normalization_data.xls'
    data = pd.read_excel(datafile, header=None)
    print((data - data.min()) / (data.max() - data.min()))   # min-max normalization to [0, 1]
    print((data - data.mean()) / data.std())                 # z-score standardization
    print(data / 10 ** np.ceil(np.log10(data.abs().max())))  # decimal scaling
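# A tiny worked example (made-up numbers) of the three rescalings: min-max maps
# [10, 30] onto [0, 1], the z-score centers on the mean in units of standard
# deviation, and decimal scaling divides by 10**2 so every value lands in [-1, 1].
import numpy as np
import pandas as pd

toy = pd.DataFrame({0: [10, 20, 30]})
print((toy - toy.min()) / (toy.max() - toy.min()))      # 0.0, 0.5, 1.0
print((toy - toy.mean()) / toy.std())                   # -1.0, 0.0, 1.0
print(toy / 10 ** np.ceil(np.log10(toy.abs().max())))   # 0.1, 0.2, 0.3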
import pandas as pd
from tqdm import tqdm
from calendars import DayCounts

dc = DayCounts('BUS/252', calendar='anbima')

# BW path
# file_path = r'C:\Users\gamarante\Dropbox\Aulas\Insper - Financas Quantitativas\VNA Raw.xlsx'
# macbook path
# file_path = r'/Users/gusamarante/Dropbox/Aulas/Insper - Financas Quantitativas/VNA Raw.xlsx'
# mac path
file_path = r'/Users/gustavoamarante/Dropbox/Aulas/Insper - Financas Quantitativas/VNA Raw.xlsx'

df_mensal = pd.read_excel(file_path, 'Mensal', index_col=0)
df_diario = pd.read_excel(file_path, 'Diario', index_col=0, na_values=['#N/A N/A'])
df_release = pd.read_excel(file_path, 'Release')
df_release.columns = ['Date', 'IPCA']

df = pd.DataFrame(index=pd.date_range('2003-03-18', 'today', freq='D'),
                  columns=['dia util', 'ultima virada', 'DU desde virada', 'DU entre viradas',
                           'time fraction', 'proj anbima', 'saiu IPCA', 'ultimo IPCA',
                           'proj IPCA', 'ultimo index', 'VNA'])
df.index.name = 'Date'
df['dia util'] = dc.isbus(df.index)

# the last "virada" (index roll-over) is the 15th of the current month once the
# 15th has passed, otherwise the 15th of the previous month
for d in tqdm(df.index, 'Filling "ultima virada"'):
    if d.day >= 15:
        df.loc[d, 'ultima virada'] = pd.Timestamp(d.year, d.month, 15)
    else:
        if d.month - 1 == 0:
            df.loc[d, 'ultima virada'] = pd.Timestamp(d.year - 1, 12, 15)
        else:
            df.loc[d, 'ultima virada'] = pd.Timestamp(d.year, d.month - 1, 15)
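# A sketch (not from the original script, numbers made up) of the projection
# these columns feed: under ANBIMA's convention the last published VNA is
# carried forward by the projected IPCA, compounded over the business-day
# fraction elapsed between the two viradas.
def project_vna(last_vna, ipca_proj, du_since, du_between):
    time_fraction = du_since / du_between  # 'DU desde virada' / 'DU entre viradas'
    return last_vna * (1.0 + ipca_proj) ** time_fraction

print(project_vna(last_vna=3500.0, ipca_proj=0.005, du_since=10, du_between=21))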
def _attemptImport(fileName, sep=',', dec='.'):
    """Attempts to import a file with the specified settings and raises
    ConditionsImportError if it fails due to an invalid format

    :param fileName: str
    :param sep: str indicating the separator for cells (',', ';' etc.)
    :param dec: str indicating the decimal point ('.', ',')
    :return: trialList, fieldNames
    """
    if fileName.endswith(('.csv', '.tsv')):
        trialsArr = pd.read_csv(fileName, encoding='utf-8-sig',
                                sep=sep, decimal=dec)
        logging.debug(u"Read csv file with pandas: {}".format(fileName))
    elif fileName.endswith(('.xlsx', '.xls', '.xlsm')):
        trialsArr = pd.read_excel(fileName)
        logging.debug(u"Read Excel file with pandas: {}".format(fileName))
    # then try to convert the array to trialList and fieldNames
    unnamed = trialsArr.columns.to_series().str.contains('^Unnamed: ')
    trialsArr = trialsArr.loc[:, ~unnamed]  # clear unnamed cols
    logging.debug(u"Clearing unnamed columns from {}".format(fileName))
    trialList, fieldNames = pandasToDictList(trialsArr)
    return trialList, fieldNames
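# A toy illustration (made-up frame) of the 'Unnamed:' filter above: pandas
# names blank header cells 'Unnamed: <n>', and the regex mask drops them.
import pandas as pd

df = pd.DataFrame({'stim': ['left', 'right'], 'Unnamed: 1': [None, None]})
unnamed = df.columns.to_series().str.contains('^Unnamed: ')
print(df.loc[:, ~unnamed])  # only the 'stim' column survives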
try:
    if path_ext == dp_static.EXT_CSV:  # assumed constant, mirroring EXT_TSV/EXT_XLS below
        print('csv file')
        data = pd.read_csv(path, low_memory=False)
    elif path_ext in [dp_static.EXT_TSV, dp_static.EXT_TAB]:
        print('Tab-delimited')
        data = pd.read_csv(path, delimiter='\t', low_memory=False)
    elif path_ext in [dp_static.EXT_XLS, dp_static.EXT_XLSX]:
        print('Excel file')
        data = pd.read_excel(path)
    else:
        return err_resp('File extension not valid: %s' % path_ext)
except FileNotFoundError as err_obj:
    return err_resp('File not found: %s' % err_obj)
except pd.errors.ParserError as err_obj:
    return err_resp('Failed to open file: %s' % err_obj)

if 'd3mIndex' not in data:
    data.insert(0, 'd3mIndex', range(len(data)))

return ok_resp(data)
def _read_data(self, data_dir):
    df = pd.read_excel(data_dir)
    return df
import requests
import json
import lxml.html as lh
from lxml.html import fromstring
import time
import os
import pandas as pd
from fuzzywuzzy import fuzz, process

# Set working directory
os.chdir('/Users/user/Projects/webDS/_util')
localDir = '03_Fuzzy_match_files/'
dbDir = '_django/loganalysis/'

# Bring in historical file of (somewhat edited) matches
GoldStandard = pd.read_excel('01_Import-transform_files/GoldStandard_master.xlsx')

#%%
# ===========================================================
# 2. FuzzyAutoAdd - When the phrase-match score is 90 or
#    higher, assign without manual checking
# ===========================================================
'''
Isolate terms that might be a minor misspelling or a foreign-language version
of an existing term. Some of these matches will be wrong, but overall it's a
good use of time to assign each term to what it looks very similar to. Here we
set the scorer to match whole terms/phrases, and the fuzzy matching score must
be 90 or higher.
'''

# Quick test, if you want - punctuation difference
fuzz.ratio('Testing FuzzyWuzzy', 'Testing FuzzyWuzzy!!')
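# A minimal sketch of the auto-assign rule described above, using fuzzywuzzy's
# process.extractOne. The candidate list and query are made up; fuzz.ratio
# scores the whole phrase, and score_cutoff=90 enforces the threshold.
from fuzzywuzzy import fuzz, process

gold_terms = ['magnetic resonance imaging', 'computed tomography']  # hypothetical
best = process.extractOne('magnetic resonance imagin',  # minor misspelling
                          gold_terms, scorer=fuzz.ratio, score_cutoff=90)
if best is not None:
    term, score = best
    print(term, score)  # assign automatically, no manual check needed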
def parse_alameda():
    sovc_xls = 'https://www.acgov.org/rov/elections/20150317/documents/sovc.xls'
    primary = pd.read_excel(sovc_xls, sheet_name='Sheet1')
    # Select only the contests of interest and the important columns
    primary = primary.loc[(primary.index > 3) & (primary.index < 385)][
        ['Alameda County', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12']]
    table = prepare_output(primary, 'Alameda', 7,
                           ['Terry Kremin', 'Susan Bonilla', 'Joan Buchanan',
                            'Michaela M. Hertle', 'Steve Glazer'])
    # repeated stable sorts: the key sorted last ('county') becomes the primary key
    for x in ['candidate', 'district', 'office', 'precinct', 'county']:
        table = table.sort_values(by=x, kind='mergesort')
    table.to_csv('2015/20150317__ca__special__primary__alameda__precinct.csv',
                 header=output_columns, index=False)
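# Equivalent single call (a sketch on a made-up frame): because 'mergesort' is
# stable, sorting repeatedly by candidate, district, office, precinct, then
# county composes into one multi-key sort with 'county' as the primary key.
import pandas as pd

demo = pd.DataFrame({'county': ['B', 'A'], 'precinct': [2, 1], 'office': ['x', 'y'],
                     'district': [1, 2], 'candidate': ['c1', 'c2']})
print(demo.sort_values(by=['county', 'precinct', 'office', 'district', 'candidate']))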
def get_data_from_source(self):
    # read every cell as text and blank out NaNs before parsing row by row
    data_frame = pd.read_excel(self.file, sheet_name=self.sheet_name,
                               dtype=str).replace(np.nan, '', regex=True)
    self._analyse_header(list(data_frame))
    for row in data_frame.values.tolist():
        self._read_data_from_table(row)
    return self.data_table