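# ----------------------------------------------------------------------
# The code snippets below assume the enclosing modules' imports (os, sys,
# json, csv, collections) and the local bqutil / gsutil helper modules.
#
# Snippet: research-data report setup -- derive the org from the first
# course_id, pick the output dataset and GS bucket, then probe which
# RESEARCH_DATA_PRODUCTS tables already exist for the course in BigQuery.
# ----------------------------------------------------------------------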
if not course_id_set:
    print "ERROR! Must specify list of course_id's for report. Aborting."
    return

org = course_id_set[0].split('/',1)[0]   # extract org from first course_id
self.org = org
self.output_project_id = output_project_id

crname = ('course_report_%s' % org)
if use_dataset_latest:
    crname = 'course_report_latest'
self.dataset = output_dataset_id or crname

self.gsbucket = gsutil.gs_path_from_course_id(crname, gsbucket=output_bucket)
self.course_id_set = course_id_set
course_id = course_id_set

#course_datasets = [ bqutil.course_id2dataset(x, use_dataset_latest=use_dataset_latest) for x in course_id_set]
#course_datasets_dict = { x:bqutil.course_id2dataset(x, use_dataset_latest=use_dataset_latest) for x in course_id_set}
course_dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

self.rdp_matrix = collections.OrderedDict()

#for course_id in course_datasets_dict.keys():
print "[researchData] Processing data for course %s" % course_id
sys.stdout.flush()

for rdp in RESEARCH_DATA_PRODUCTS.keys():
    try:
        table = bqutil.get_bq_table_info(course_dataset, rdp)
        #table = bqutil.get_bq_table_info( course_id, rdp )
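
# ----------------------------------------------------------------------
# Snippet: course-report setup -- same prelude as above, but this variant
# maps each course_id to its BigQuery dataset and records which datasets
# already hold person_course and related tables.
# ----------------------------------------------------------------------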
if not course_id_set:
    print "ERROR! Must specify list of course_id's for report. Aborting."
    return

org = course_id_set[0].split('/',1)[0]   # extract org from first course_id
self.org = org
self.output_project_id = output_project_id

crname = ('course_report_%s' % org)
if use_dataset_latest:
    crname = 'course_report_latest'
self.dataset = output_dataset_id or crname

self.gsbucket = gsutil.gs_path_from_course_id(crname, gsbucket=output_bucket)
self.course_id_set = course_id_set

course_datasets = [bqutil.course_id2dataset(x, use_dataset_latest=use_dataset_latest) for x in course_id_set]

# check to see which datasets have person_course tables
datasets_with_pc = []
self.all_pc_tables = OrderedDict()
self.all_pcday_ip_counts_tables = OrderedDict()
self.all_pcday_trlang_counts_tables = OrderedDict()
self.all_uic_tables = OrderedDict()
self.all_ca_tables = OrderedDict()
self.all_va_tables = OrderedDict()
self.all_tott_tables = OrderedDict()

for cd in course_datasets:
    try:
        table = bqutil.get_bq_table_info(cd, 'person_course')
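
# ----------------------------------------------------------------------
# Snippet: write out newly-collected geoip records as JSON, upload the
# file to Google Cloud Storage, and load it into the extra_geoip table
# in BigQuery (errors are reported but not fatal).
# ----------------------------------------------------------------------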
        raise
ofp.close()

lock_file(self.gipfn)
try:
    print "--> renaming %s to %s" % (ofn, self.gipfn)
    sys.stdout.flush()
    os.rename(ofn, self.gipfn)
except Exception as err:
    print "Error %s in renaming gipfn" % str(err)
lock_file(self.gipfn, release=True)

mypath = os.path.dirname(os.path.realpath(__file__))
the_schema = json.loads(open('%s/schemas/schema_extra_geoip.json' % mypath).read())['extra_geoip']

gsp = gsutil.gs_path_from_course_id(self.gipdataset) / self.gipfn
print "--> Uploading %s to %s" % (self.gipfn, gsp)
sys.stdout.flush()
gsutil.upload_file_to_gs(self.gipfn, gsp, '-z json')

print "--> Importing %s to %s" % (gsp, self.giptable)
sys.stdout.flush()

try:
    bqutil.create_dataset_if_nonexistent(self.gipdataset)
except Exception as err:
    print "--> Warning: failed to create %s, err=%s" % (gsp, err)

try:
    bqutil.load_data_to_table(self.gipdataset, self.giptable, gsp, the_schema)
except Exception as err:
    print "---> ERROR: failed to load %s into BigQuery %s.%s, err=%s" % (gsp, self.gipdataset, self.giptable, err)
    print "---> Continuing anyway"
    sys.stdout.flush()
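
# ----------------------------------------------------------------------
# Snippet: build the video_axis table -- recreate it when missing or
# stale, look up YouTube video durations, upload and load the result
# into BigQuery, then compute the video stats tables and
# person_course_video_watched.
# ----------------------------------------------------------------------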
print "video_axis exists, but has date %s, older than course_axis date %s; forcing recompute" % (va_date, ca_date)
sys.stdout.flush()
if not videoAxisExists or force_recompute:
force_recompute = True
createVideoAxis(course_id=course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)
# Get video lengths
va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS)
assert va is not None, "[analyze videos] Possibly no data in video axis table. Check course axis table"
va_bqdata = va['data']
fileoutput = lfp / FILENAME_VIDEO_AXIS
getYoutubeDurations( dataset=dataset, bq_table_input=va_bqdata, api_key=api_key, outputfilename=fileoutput, schema=the_dict_schema, force_recompute=force_recompute )
# upload and import video axis
gsfn = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS
gsutil.upload_file_to_gs(fileoutput, gsfn)
table = TABLE_VIDEO_AXIS
bqutil.load_data_to_table(dataset, table, gsfn, the_schema, wait=True)
else:
print "[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % ( dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS )
# Lastly, create video stats
createVideoStats_day( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )
createVideoStats( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )
# also create person_course_video_watched
createPersonCourseVideo( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )
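
# ----------------------------------------------------------------------
# Snippet: extract the grading policy from the course content tarball,
# write it out as CSV, upload it to Google Cloud Storage, and load it
# into the course's grading_policy table in BigQuery.
# ----------------------------------------------------------------------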
msg = "---> oops, cannot get couese content (with grading policy file) for %s, file %s (or 'course.xml.tar.gz' or 'course-prod-edge-analytics.xml.tar.gz') missing!" % (course_id, fn)
raise Exception(msg)
gpstr, gpfn = read_grading_policy_from_tar_file(fn)
fields, gptab, schema = load_grading_policy(gpstr, verbose=verbose, gpfn=gpfn)
ofnb = 'grading_policy.csv'
ofn = sdir / ofnb
ofp = open(ofn, 'w')
cdw = csv.DictWriter(ofp, fieldnames=fields)
cdw.writeheader()
cdw.writerows(gptab)
ofp.close()
# upload to google storage
gsdir = path(gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest))
gsutil.upload_file_to_gs(ofn, gsdir / ofnb, verbose=False)
# import into BigQuery
table = "grading_policy"
dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
bqutil.load_data_to_table(dataset, table, gsdir / ofnb, schema, format='csv', skiprows=1)
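
# ----------------------------------------------------------------------
# Snippet: load a staff roster CSV into the courses.staff BigQuery table.
# ----------------------------------------------------------------------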
def do_staff_csv(staff_csv_fn):
    dataset = 'courses'
    table = 'staff'
    bqutil.create_dataset_if_nonexistent(dataset)

    mypath = os.path.dirname(os.path.realpath(__file__))
    gsfn = gsutil.gs_path_from_course_id('courses') / 'staff.csv'
    gsutil.upload_file_to_gs(staff_csv_fn, gsfn)

    schema = json.loads(open('%s/schemas/schema_staff.json' % mypath).read())['staff']
    bqutil.load_data_to_table(dataset, table, gsfn, schema, wait=True, format='csv', skiprows=1)
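
# A minimal usage sketch for the function above (the path is hypothetical;
# the CSV's columns must match schemas/schema_staff.json):
#
#   do_staff_csv('/path/to/staff.csv')

# ----------------------------------------------------------------------
# Snippet: concatenate per-course person_course CSV exports (dropping
# repeated header rows), upload the combined CSV to Google Cloud Storage,
# and choose the course_report dataset to import it into.
# ----------------------------------------------------------------------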
cmd = "zcat %s > %s" % (zfn, ofn)
header = os.popen("zcat %s | head -1" % zfn).read().strip()
firstfn = zfn
else:
cmd = "zcat %s | tail -n +2 >> %s" % (zfn, ofn) # first row is header; don't keep when concatenating
print cmd
first = 0
new_header = os.popen("zcat %s | head -1" % zfn).read().strip()
if not header == new_header:
print "==> Warning! header mismatch for %s vs %s" % (zfn, firstfn)
print " %s has: %s" % (firstfn, header)
print " but %s has: %s" % (zfn, new_header)
sys.stdout.flush()
os.system(cmd)
gb = gsutil.gs_path_from_course_id('course_report_%s' % org, gsbucket=output_bucket)
print "="*77
print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
sys.stdout.flush()
cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
print cmd
os.system(cmd)
gsfn = gb + '/' + ofn
print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(gsfn)
# import into BigQuery
crname = ('course_report_%s' % org)
if use_dataset_latest:
crname = 'course_report_latest'
dataset = output_dataset_id or crname
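
# ----------------------------------------------------------------------
# Snippet: serialize course_axis entries to JSON, upload the axis file
# and course xbundle to Google Cloud Storage, load course_axis into
# BigQuery, and assemble a length-limited log message.
# ----------------------------------------------------------------------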
    if ca['start'] is not None:
        ca['start'] = str(ca['start'])   # datetime to string
    if ca['due'] is not None:
        ca['due'] = str(ca['due'])   # datetime to string
    if (ca['data'] is None) or (ca['data'] == ''):
        ca.pop('data')

    check_schema(linecnt, ca, the_ds=dict_schema, coerce=True)
    try:
        # db.course_axis.insert(ca)
        fp.write(json.dumps(ca) + '\n')
    except Exception as err:
        print "Failed to save! Error=%s, data=%s" % (err, ca)

fp.close()

# upload axis.json file and course xbundle
gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))
if 1:
    gsutil.upload_file_to_gs(cafn, gsdir, options="-z json", verbose=False)
    gsutil.upload_file_to_gs(xbfn, gsdir, options='-z xml', verbose=False)

# import into BigQuery
dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
bqutil.create_dataset_if_nonexistent(dataset)   # create dataset if not already existent
table = "course_axis"
bqutil.load_data_to_table(dataset, table, gsdir / (cafn.basename()), the_schema)

msg = "="*100 + '\n'
msg += "Course axis for %s\n" % (cid)
msg += "="*100 + '\n'
msg += '\n'.join(log_msg)
msg = msg[:16184]   # max message length 16384