How to use the gsutil.gs_path_from_course_id function in gsutil

To help you get started, we’ve selected a few gsutil examples, based on popular ways it is used in public projects.

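Before the examples, here is a minimal sketch of the call pattern they share. It assumes edx2bigquery is importable as a package; the course id and bucket below are placeholders, and the keyword arguments are the optional ones that appear in the snippets:

from edx2bigquery import gsutil  # assumes edx2bigquery is installed as a package

course_id = 'MITx/6.002x/2013_Spring'        # placeholder course id
gsdir = gsutil.gs_path_from_course_id(
    course_id,
    gsbucket='gs://my-output-bucket',        # placeholder bucket (optional)
    use_dataset_latest=False,                # optional, as in the snippets below
)
print gsdir / 'course_axis.json'             # returned value joins like a path, with "/"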

github mitodl / edx2bigquery / edx2bigquery / make_research_data_tables.py View on Github
        if not course_id_set:
            print "ERROR! Must specify list of course_id's for report.  Aborting."
            return

        org = course_id_set[0].split('/',1)[0]    # extract org from first course_id
        self.org = org

        self.output_project_id = output_project_id

        crname = ('course_report_%s' % org)
        if use_dataset_latest:
            crname = 'course_report_latest'
        self.dataset = output_dataset_id or crname

        self.gsbucket = gsutil.gs_path_from_course_id(crname, gsbucket=output_bucket)
        self.course_id_set = course_id_set
        course_id = course_id_set

        #course_datasets = [ bqutil.course_id2dataset(x, use_dataset_latest=use_dataset_latest) for x in course_id_set]
        #course_datasets_dict = { x:bqutil.course_id2dataset(x, use_dataset_latest=use_dataset_latest) for x in course_id_set}
        course_dataset = bqutil.course_id2dataset( course_id, use_dataset_latest=use_dataset_latest )

        self.rdp_matrix = collections.OrderedDict()
        #for course_id in course_datasets_dict.keys():

        print "[researchData] Processing data for course %s" % ( course_id )
        sys.stdout.flush()
        for rdp in RESEARCH_DATA_PRODUCTS.keys():
            try:
                table = bqutil.get_bq_table_info( course_dataset, rdp )
                #table = bqutil.get_bq_table_info( course_id, rdp )

github mitodl / edx2bigquery / edx2bigquery / make_course_report_tables.py View on Github
if not course_id_set:
            print "ERROR! Must specify list of course_id's for report.  Aborting."
            return

        org = course_id_set[0].split('/',1)[0]    # extract org from first course_id
        self.org = org

        self.output_project_id = output_project_id

        crname = ('course_report_%s' % org)
        if use_dataset_latest:
            crname = 'course_report_latest'
        self.dataset = output_dataset_id or crname

        self.gsbucket = gsutil.gs_path_from_course_id(crname, gsbucket=output_bucket)
        self.course_id_set = course_id_set

        course_datasets = [ bqutil.course_id2dataset(x, use_dataset_latest=use_dataset_latest) for x in course_id_set]

        # check to see which datasets have person_course tables
        datasets_with_pc = []
        self.all_pc_tables = OrderedDict()
        self.all_pcday_ip_counts_tables = OrderedDict()
        self.all_pcday_trlang_counts_tables = OrderedDict()
        self.all_uic_tables = OrderedDict()
        self.all_ca_tables = OrderedDict()
        self.all_va_tables = OrderedDict()
        self.all_tott_tables = OrderedDict()
        for cd in course_datasets:
            try:
                table = bqutil.get_bq_table_info(cd, 'person_course')

github mitodl / edx2bigquery / edx2bigquery / make_geoip_table.py View on Github
            raise
        ofp.close()

        lock_file(self.gipfn)
        try:
            print "--> renaming %s to %s" % (ofn, self.gipfn)
            sys.stdout.flush()
            os.rename(ofn, self.gipfn)
        except Exception as err:
            print "Error %s in renaming gipfn" % str(err)
        lock_file(self.gipfn, release=True)
        
        mypath = os.path.dirname(os.path.realpath(__file__))
        the_schema = json.loads(open('%s/schemas/schema_extra_geoip.json' % mypath).read())['extra_geoip']

        gsp = gsutil.gs_path_from_course_id(self.gipdataset) / self.gipfn
        print "--> Uploading %s to %s" % (self.gipfn, gsp)
        sys.stdout.flush()
        gsutil.upload_file_to_gs(self.gipfn, gsp, '-z json')
        
        print "--> Importing %s to %s" % (gsp, self.giptable)
        sys.stdout.flush()
        try:
            bqutil.create_dataset_if_nonexistent(self.gipdataset)
        except Exception as err:
            print "--> Warning: failed to create %s, err=%s" % (gsp, err)
        try:
            bqutil.load_data_to_table(self.gipdataset, self.giptable, gsp, the_schema)
        except Exception as err:
            print "---> ERROR: failed to load %s into BigQuery %s.%s, err=%s" % (gsp, self.gipdataset, self.giptable, err)
            print "---> Continuing anyway"
            sys.stdout.flush()

github mitodl / edx2bigquery / edx2bigquery / make_video_analysis.py View on Github
print "video_axis exists, but has date %s, older than course_axis date %s; forcing recompute" % (va_date, ca_date)
        sys.stdout.flush()

    if not videoAxisExists or force_recompute:
        force_recompute = True
        createVideoAxis(course_id=course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)

        # Get video lengths
        va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS)
        assert va is not None, "[analyze videos] Possibly no data in video axis table. Check course axis table"
        va_bqdata = va['data']
        fileoutput = lfp / FILENAME_VIDEO_AXIS
        getYoutubeDurations( dataset=dataset, bq_table_input=va_bqdata, api_key=api_key, outputfilename=fileoutput, schema=the_dict_schema, force_recompute=force_recompute )

        # upload and import video axis
        gsfn = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS
        gsutil.upload_file_to_gs(fileoutput, gsfn)
        table = TABLE_VIDEO_AXIS
        bqutil.load_data_to_table(dataset, table, gsfn, the_schema, wait=True)

    else:
        print "[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % ( dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS )

    # Lastly, create video stats
    createVideoStats_day( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )
    createVideoStats( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )

    # also create person_course_video_watched
    createPersonCourseVideo( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )

github mitodl / edx2bigquery / edx2bigquery / make_grading_policy_table.py View on Github
msg = "---> oops, cannot get couese content (with grading policy file) for %s, file %s (or 'course.xml.tar.gz' or 'course-prod-edge-analytics.xml.tar.gz') missing!" % (course_id, fn)
        raise Exception(msg)

    gpstr, gpfn = read_grading_policy_from_tar_file(fn)
    fields, gptab, schema = load_grading_policy(gpstr, verbose=verbose, gpfn=gpfn)
    
    ofnb = 'grading_policy.csv'
    ofn = sdir / ofnb
    ofp = open(ofn, 'w')
    cdw = csv.DictWriter(ofp, fieldnames=fields)
    cdw.writeheader()
    cdw.writerows(gptab)
    ofp.close()

    # upload to google storage
    gsdir = path(gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest))
    gsutil.upload_file_to_gs(ofn, gsdir / ofnb, verbose=False)
    
    # import into BigQuery
    table = "grading_policy"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.load_data_to_table(dataset, table, gsdir / ofnb, schema, format='csv', skiprows=1)

github mitodl / edx2bigquery / edx2bigquery / load_staff.py View on Github
def do_staff_csv(staff_csv_fn):
    dataset = 'courses'
    table = 'staff'
    bqutil.create_dataset_if_nonexistent(dataset)
    mypath = os.path.dirname(os.path.realpath(__file__))

    gsfn = gsutil.gs_path_from_course_id('courses') / 'staff.csv'
    gsutil.upload_file_to_gs(staff_csv_fn, gsfn)

    schema = json.loads(open('%s/schemas/schema_staff.json' % mypath).read())['staff']
    bqutil.load_data_to_table(dataset, table, gsfn, schema, wait=True, format='csv', skiprows=1)

github mitodl / edx2bigquery / edx2bigquery / make_combined_person_course.py View on Github
cmd = "zcat %s > %s" % (zfn, ofn)
                header = os.popen("zcat %s | head -1" % zfn).read().strip()
                firstfn = zfn
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (zfn, ofn)	# first row is header; don't keep when concatenating
            print cmd
            first = 0
            new_header = os.popen("zcat %s | head -1" % zfn).read().strip()
            if not header == new_header:
                print "==> Warning!  header mismatch for %s vs %s" % (zfn, firstfn)
                print "    %s has: %s" % (firstfn, header)
                print "    but %s has: %s" % (zfn, new_header)
            sys.stdout.flush()
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org, gsbucket=output_bucket)

    print "="*77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname

github mitodl / edx2bigquery / edx2bigquery / axis2bigquery.py View on Github
        if ca['start'] is not None:
            ca['start'] = str(ca['start'])    # datetime to string
        if ca['due'] is not None:
            ca['due'] = str(ca['due'])    # datetime to string
        if (ca['data'] is None) or (ca['data']==''):
            ca.pop('data')
        check_schema(linecnt, ca, the_ds=dict_schema, coerce=True)
        try:
            # db.course_axis.insert(ca)
            fp.write(json.dumps(ca)+'\n')
        except Exception as err:
            print "Failed to save!  Error=%s, data=%s" % (err, ca)
    fp.close()

    # upload axis.json file and course xbundle
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))
    if 1:
        gsutil.upload_file_to_gs(cafn, gsdir, options="-z json", verbose=False)
        gsutil.upload_file_to_gs(xbfn, gsdir, options='-z xml', verbose=False)

    # import into BigQuery
    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)    # create dataset if not already existent
    table = "course_axis"
    bqutil.load_data_to_table(dataset, table, gsdir / (cafn.basename()), the_schema)

    msg = "="*100 + '\n'
    msg += "Course axis for %s\n" % (cid)
    msg += "="*100 + '\n'
    msg += '\n'.join(log_msg)
    msg = msg[:16184]    # truncate to stay below the 16384-character message limit
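
Taken together, these examples use gs_path_from_course_id in the same three-step pattern: compute the course's Google Storage directory, upload a local file into it, then load the uploaded file into the matching BigQuery table. Below is a condensed, hypothetical sketch of that pattern, built only from helper calls that appear in the snippets above; the course id, file names, and schema file are placeholders:

import json
from edx2bigquery import gsutil, bqutil   # assumes both helper modules are importable

course_id = 'MITx/6.002x/2013_Spring'     # placeholder course id
dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=False)

# 1. compute the Google Storage directory for this course
gsdir = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=False)

# 2. upload the local file into that directory
gsutil.upload_file_to_gs('grading_policy.csv', gsdir / 'grading_policy.csv', verbose=False)

# 3. load the uploaded file into BigQuery
the_schema = json.loads(open('schema_grading_policy.json').read())['grading_policy']  # placeholder schema file
bqutil.create_dataset_if_nonexistent(dataset)
bqutil.load_data_to_table(dataset, 'grading_policy', gsdir / 'grading_policy.csv',
                          the_schema, format='csv', skiprows=1)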