    data from many reports, including stats_course_content, the medians report, the listings file,
    broad_stats_by_course, and time_on_task_stats_by_course.
    '''
    if do_upload:
        if use_dataset_latest:
            org = "latest"
        else:
            org = courses[0].split('/', 1)[0]    # extract org from first course_id in courses

        crname = 'course_report_%s' % org
        gspath = gsutil.gs_path_from_course_id(crname)
        gsfnp = gspath / CCDATA
        gsutil.upload_file_to_gs(CCDATA, gsfnp)

        tableid = "stats_course_content"
        dataset = crname

        mypath = os.path.dirname(os.path.realpath(__file__))
        SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath
        try:
            the_schema = json.loads(open(SCHEMA_FILE).read())[tableid]
        except Exception as err:
            print "Oops! Failed to load schema file for %s. Error: %s" % (tableid, str(err))
            raise

        if 0:    # disabled: the BigQuery load of stats_course_content is currently switched off
            bqutil.load_data_to_table(dataset, tableid, gsfnp, the_schema, wait=True, verbose=False,
                                      format='csv', skiprows=1)
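
# Hedged sketch (not part of the original source): the pattern above -- read a schema
# entry keyed by table name from schemas/, upload the local file to GCS, then load it
# into BigQuery -- collected into one helper.  Only calls already shown in this file
# are used; the helper name and its arguments are invented for illustration, and
# gspath is assumed to be a gs path object supporting '/'.

import json
import os

import bqutil
import gsutil

def upload_and_load_csv(dataset, tableid, local_fn, gspath, schema_file):
    '''Upload local_fn to gspath on GCS, then load it into dataset.tableid
       using the schema entry keyed by tableid in schema_file.'''
    the_schema = json.loads(open(schema_file).read())[tableid]
    gsfnp = gspath / os.path.basename(local_fn)
    gsutil.upload_file_to_gs(local_fn, gsfnp)
    bqutil.load_data_to_table(dataset, tableid, gsfnp, the_schema,
                              wait=True, verbose=False, format='csv', skiprows=1)
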
    statbuf = os.stat(fn)
    mt = datetime.datetime.fromtimestamp(statbuf.st_mtime)

    # date check: only upload files which have changed and are newer than the copy on google cloud storage
    local_dt = local.localize(mt, is_dst=None)
    utc_dt = local_dt.astimezone(pytz.utc)

    fnb = os.path.basename(fn)
    if fnb in fnset and fnset[fnb]['date'] > utc_dt:
        print "...%s already copied, skipping" % fn
        sys.stdout.flush()
        return
    elif fnb in fnset:
        print "...%s already exists, but has date=%s and mtime=%s, re-uploading" % (fn, fnset[fnb]['date'], mt)

    gsutil.upload_file_to_gs(fn, gsdir / fnb, options=options, verbose=True)
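
# Hedged sketch (illustration only): the modification-time check above, pulled out so
# it can be exercised on its own.  'US/Eastern' is an assumed default timezone; the
# surrounding code gets `local` from its own configuration.

import datetime
import os

import pytz

def file_mtime_utc(fn, local_tz_name='US/Eastern'):
    '''Return the file's modification time as a timezone-aware UTC datetime.'''
    local = pytz.timezone(local_tz_name)
    mt = datetime.datetime.fromtimestamp(os.stat(fn).st_mtime)
    return local.localize(mt, is_dst=None).astimezone(pytz.utc)

# A file is then skipped when the copy already on GCS is newer:
#     if fnb in fnset and fnset[fnb]['date'] > file_mtime_utc(fn): ... skip ...
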
    if ofnb in gsfiles:
        print "Already have %s, skipping" % ofnb
        sys.stdout.flush()
        continue

    # dump tracking log for the given date range using mongoexport, if needed
    if not os.path.exists(ofn):
        # e.g. db.tracking_log.find({'course_id': "HarvardX/ER22x/2013_Spring",
        #                            'time': { '$gte': "2013-08-01T00:00:00.000000", '$lt': "2013-08-02T00:00:00.000000" }}).count()
        query = '{"course_id": "%s", "time": {"$gte": "%s", "$lt": "%s" }}' % (course_id, start, end)
        cmd = "mongoexport -d %s -c %s -q '%s' | edx2bigquery rephrase_logs | gzip -9 > %s" % (dbname, collection, query, ofn)
        # print cmd
        os.system(cmd)

    upload_file_to_gs(ofn, gspath + '/' + ofnb)

    filebuf.append(ofn)
    if len(filebuf) > 20:
        # keep at most 20 exported files locally; delete the oldest
        ffn = filebuf.pop(0)
        os.unlink(ffn)
        print "...Deleted %s" % ffn
        sys.stdout.flush()
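
# Hedged sketch (illustration only): building the one-day $gte/$lt window used in the
# mongoexport query above.  The timestamp format matches the example shown in the
# tracking-log comment ("2013-08-01T00:00:00.000000").

import datetime

def day_window(day):
    '''Return (start, end) strings covering one calendar day.'''
    fmt = '%Y-%m-%dT%H:%M:%S.%f'
    start = datetime.datetime(day.year, day.month, day.day)
    end = start + datetime.timedelta(days=1)
    return start.strftime(fmt), end.strftime(fmt)

# day_window(datetime.date(2013, 8, 1))
# -> ('2013-08-01T00:00:00.000000', '2013-08-02T00:00:00.000000')
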
        if ca['due'] is not None:
            ca['due'] = str(ca['due'])    # datetime to string
        if (ca['data'] is None) or (ca['data'] == ''):
            ca.pop('data')
        check_schema(linecnt, ca, the_ds=dict_schema, coerce=True)
        try:
            # db.course_axis.insert(ca)
            fp.write(json.dumps(ca) + '\n')
        except Exception as err:
            print "Failed to save! Error=%s, data=%s" % (err, ca)
    fp.close()

    # upload axis.json file and course xbundle
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))
    if 1:    # always true: upload both files
        gsutil.upload_file_to_gs(cafn, gsdir, options="-z json", verbose=False)
        gsutil.upload_file_to_gs(xbfn, gsdir, options='-z xml', verbose=False)

    # import into BigQuery
    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)    # create dataset if not already existent
    table = "course_axis"
    bqutil.load_data_to_table(dataset, table, gsdir / (cafn.basename()), the_schema)

    msg = "=" * 100 + '\n'
    msg += "Course axis for %s\n" % (cid)
    msg += "=" * 100 + '\n'
    msg += '\n'.join(log_msg)
    msg = msg[:16184]    # max message length 16384
    bqutil.add_description_to_table(dataset, table, msg, append=True)
    print "    Done - inserted %s records into course_axis" % len(caset)
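
# Hedged sketch (illustration only): the course_axis records above are written one
# JSON object per line before being loaded into BigQuery.  This standalone version of
# that loop (record cleaning plus newline-delimited JSON output) is invented for
# illustration; the field names mirror the ones used above.

import json

def write_ndjson(records, out_fn):
    '''Write records as newline-delimited JSON, coercing 'due' datetimes to str
       and dropping empty 'data' fields, as done above.'''
    fp = open(out_fn, 'w')
    for rec in records:
        if rec.get('due') is not None:
            rec['due'] = str(rec['due'])
        if rec.get('data') in (None, ''):
            rec.pop('data', None)
        fp.write(json.dumps(rec) + '\n')
    fp.close()
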
    csvfn = sdir / csv_name
    tempfn = sdir / temp_name

    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(open('%s/schemas/schema_%s.json' % (mypath, table)).read())[table]

    if not os.path.exists(csvfn):
        print "[edx2bigquery] make_grades_persistent: missing file %s, skipping" % csvfn
        return

    if not subsection:
        cleanup_rows_from_grade_persistent(csvfn, tempfn)
    else:
        cleanup_rows_from_grade_persistent(csvfn, tempfn, field_to_fix="first_attempted")

    gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True)

    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)    # create dataset if not already existent

    bqutil.load_data_to_table(dataset,
                              table,
                              gsdir / csv_name,
                              the_schema,
                              format="csv",
                              skiprows=1)
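
# Hedged sketch (hypothetical -- cleanup_rows_from_grade_persistent itself is not
# shown here): one plausible cleanup pass over the grades CSV, blanking literal
# "NULL" values in a single column so the BigQuery CSV loader accepts the file.
# The default column name and the "NULL" convention are assumptions.

import csv

def cleanup_csv_column(in_fn, out_fn, field_to_fix="passed_timestamp"):
    '''Copy in_fn to out_fn, replacing literal "NULL" in field_to_fix with "".'''
    with open(in_fn, 'rb') as fin, open(out_fn, 'wb') as fout:
        reader = csv.DictReader(fin)
        writer = csv.DictWriter(fout, fieldnames=reader.fieldnames)
        writer.writeheader()
        for row in reader:
            if row.get(field_to_fix) == "NULL":
                row[field_to_fix] = ""
            writer.writerow(row)
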
    clm_table = "course_listing_and_metainfo"
    clm_schema_file = '%s/schemas/schema_%s.json' % (mypath, clm_table)
    clm_schema = json.loads(open(clm_schema_file).read())

    clm = {}
    for finfo in clm_schema:
        field = finfo['name']
        clm[field] = cmfields.get(field)

    clm_fnb = clm_table + ".json"
    clm_fn = course_dir / clm_fnb
    open(clm_fn, 'w').write(json.dumps(clm))

    gsfnp = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / clm_fnb
    print "--> Course listing + metainfo uploading to %s then to %s.%s" % (gsfnp, dataset, clm_table)
    sys.stdout.flush()
    gsutil.upload_file_to_gs(clm_fn, gsfnp)
    bqutil.load_data_to_table(dataset, clm_table, gsfnp, clm_schema, wait=True, verbose=False)
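
# Hedged sketch (illustration only): the loop above keeps exactly one value per schema
# field, so the JSON row always matches the BigQuery schema even when cmfields has
# extra or missing keys.  Toy example with a made-up two-field schema:

toy_schema = [{'name': 'course_id', 'type': 'STRING'},
              {'name': 'course_launch', 'type': 'STRING'}]
toy_fields = {'course_id': 'HarvardX/ER22x/2013_Spring', 'extra_key': 'dropped'}

row = {}
for finfo in toy_schema:
    row[finfo['name']] = toy_fields.get(finfo['name'])
# row == {'course_id': 'HarvardX/ER22x/2013_Spring', 'course_launch': None}
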
    # output course_metainfo
    table = 'course_metainfo'
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    gsfnp = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / CMINFO

    print "--> Course metainfo uploading to %s then to %s.%s" % (gsfnp, dataset, table)
    sys.stdout.flush()
    gsutil.upload_file_to_gs(csvfn, gsfnp)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())[table]
    bqutil.load_data_to_table(dataset, table, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1)
    sys.stdout.flush()
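
# Hedged sketch (illustration only, not in the original source): because the CSV is
# loaded with skiprows=1, its header row is skipped and columns are matched to the
# schema by position.  A cheap sanity check before loading is to compare the header
# against the schema field order:

import csv

def header_matches_schema(csv_fn, schema):
    '''Return True if the CSV header names equal the schema field names, in order.'''
    with open(csv_fn, 'rb') as fin:
        header = next(csv.reader(fin))
    return header == [finfo['name'] for finfo in schema]
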
    if not videoAxisExists or force_recompute:
        force_recompute = True
        createVideoAxis(course_id=course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)

        # Get video lengths
        va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS)
        assert va is not None, "[analyze videos] Possibly no data in video axis table. Check course axis table"
        va_bqdata = va['data']
        fileoutput = lfp / FILENAME_VIDEO_AXIS
        getYoutubeDurations(dataset=dataset, bq_table_input=va_bqdata, api_key=api_key,
                            outputfilename=fileoutput, schema=the_dict_schema, force_recompute=force_recompute)

        # upload and import video axis
        gsfn = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS
        gsutil.upload_file_to_gs(fileoutput, gsfn)
        table = TABLE_VIDEO_AXIS
        bqutil.load_data_to_table(dataset, table, gsfn, the_schema, wait=True)
    else:
        print "[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % (dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS)

    # Lastly, create video stats
    createVideoStats_day(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)
    createVideoStats(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)

    # also create person_course_video_watched
    createPersonCourseVideo(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)
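
# Hedged sketch (illustration only): the videoAxisExists / force_recompute guard above
# follows the recompute-only-when-needed pattern used throughout this code.  A generic
# version of that check, reusing the bqutil.get_table_data call shown above (the exact
# error behaviour of bqutil is assumed, so both failure modes are handled):

import bqutil

def needs_recompute(dataset, table, force_recompute=False):
    '''Return True when dataset.table is missing/empty or a recompute is forced.'''
    if force_recompute:
        return True
    try:
        tdata = bqutil.get_table_data(dataset, table)
    except Exception:
        return True    # table missing or unreadable
    return tdata is None
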