How to use the ijson.backends.yajl2.items function in ijson

To help you get started, we’ve selected a few ijson examples based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github Softcatala / translation-memory-tools / source-pos / wikidata / json-to-po.py View on Github external
def _process_json(filename):
    """Stream Wikidata entities from a JSON dump and collect English/Catalan
    label pairs into a PO file.

    Parameters: filename -- path to the Wikidata JSON dump, or None.
    Returns None immediately when no filename is given.

    NOTE(review): this snippet is truncated — the body of the final ``if``
    (presumably the branch that skips or writes incomplete translations)
    is not visible here.
    """
    if filename is None:
        return None

    # Counters and tuning constants for the export run.
    cnt = 0  # entities seen so far (unused in the visible part of the snippet)
    selected = 0  # entities exported so far (unused in the visible part)
    onlyArticles = True  # restrict the export to Q-items (articles)
    PO_NAME = 'wikidata.po'
    SAVE_INTERVAL = 1000  # presumably: flush the PO file every N entries — TODO confirm
    PROCESS_NOF_ENTRIES = 2 * 1000 * 1000  # presumably an upper bound on processed entries — TODO confirm

    po_file = _create_empty_po_file()

    with open(filename, 'r') as json_data:
        # Lazily iterate the top-level 'item' objects of the dump instead of
        # loading the (multi-GB) file into memory at once.
        value = ijson.items(json_data, 'item')

        for item in value:
            label = item.get('labels')
            if label is None:
                continue

            item_id = item['id']
            if onlyArticles is True:
                # Wikidata entity ids for articles start with "Q"; skip
                # anything else (properties, lexemes, ...).
                if item_id is None or item_id.startswith("Q") is False:
                    continue

            comment = u'Article {0}'.format(item_id)
            en_label = label.get('en')
            ca_label = label.get('ca')

            # Snippet cut off below this conditional.
            if en_label is None or ca_label is None:
github sunlightlabs / thezombies / thezombies / tasks / validation.py View on Github external
# NOTE(review): fragment of a larger Celery task — the function header and
# the opening ``try`` are outside this view, and the first line's
# indentation was lost during extraction.
agency = Agency.objects.get(id=agency_id)

        except Agency.DoesNotExist as e:
            # Surface the failed lookup in the logs, then let the caller fail.
            logger.exception(e)
            raise e

    # Get schema info (schema path, dataset_prefix)
    # NOTE(review): ``schema_info`` may be None for an unknown schema name,
    # which would make the ``.get`` call below raise — confirm upstream
    # guarantees the schema key exists.
    schema_info = JSON_SCHEMAS.get(schema, None)

    # Create the audit row in its own transaction so the object-level tasks
    # spawned below can reference its id.
    with transaction.atomic():
        audit = Audit.objects.create(agency_id=agency_id, audit_type=Audit.DATA_CATALOG_VALIDATION)

    try:
        # ``closing`` guarantees the streaming HTTP response is released even
        # if iteration fails part-way through.
        with closing(open_streaming_response('GET', agency.data_json_url)) as resp:
            # Use the schema dataset_prefix to get an iterator for the items to be validated.
            objects = ijson.items(resp.raw, schema_info.get('dataset_prefix', ''))

            default_args = {'json_schema_name': schema, 'source_url': agency.data_json_url}
            if audit:
                default_args.update({'audit_id': audit.id})

            # Spin off one async validation task per streamed object; the
            # modulo countdown staggers task start times so the workers are
            # not hit with everything at once.
            tasks = []
            for num, obj in enumerate(objects):
                args = default_args.copy()
                args.update({'json_object': obj, 'object_position': num})
                task = validate_json_object.apply_async(args=(args,), countdown=(num % COUNTDOWN_MODULO))
                tasks.append(task)

    except Exception as e:
        # Snippet is truncated inside this handler.
        logger.exception(e)
github nlpaueb / BioIR / produce_centroids.py View on Github external
def calculate_centroids(self):
        """Compute an embedding centroid for every article in the corpus and
        stack them into one ``(n_articles, dim)`` float32 array.

        NOTE(review): Python 2 code (``print`` statement). The snippet is cut
        off right after the shape printout — whatever persists
        ``final_cent_array`` / ``idmap`` to ``self.centroids_file`` happens
        below this view. The ``def`` line's class-level indentation was also
        lost during extraction.
        """
        # Start from a clean output file.
        if os.path.exists(self.centroids_file):
            os.remove(self.centroids_file)

        # NOTE(review): ``f`` is never closed in the visible code — a ``with``
        # block would be safer, but the tail of the method is not visible.
        f = open(self.corpus_file, 'r')
        # Stream articles one at a time instead of loading the whole corpus.
        objects = ijson.items(f, 'articles.item')
        i = 0         # running article index
        idmap = {}    # row index -> pmid, to map matrix rows back to articles
        cent_array = []
        for article in objects:
            abstract_text = article["abstractText"]
            abstract_id = article["pmid"]
            text = article["title"] + " " + abstract_text

            # Presumably an IDF-weighted centroid of the word embeddings of
            # ``text`` — confirm against ``get_centroid_idf``'s definition.
            centroid = get_centroid_idf(text, self.emb, self.idf, self.stopwords, self.dim)

            cent_array.append(np.array(centroid, dtype=np.float32))

            idmap[i] = abstract_id
            i += 1
        # One row per article; ``i`` holds the final article count.
        final_cent_array = np.array(cent_array, dtype=np.float32).reshape((i, self.dim))
        print final_cent_array.shape
github nlpaueb / BioIR / reranking.py View on Github external
def initialize(self):
        """Load the retrieval results file, then pull every abstract those
        results reference out of the corpus into ``self.corpus_index``.

        NOTE(review): Python 2 code (``print`` statements). The snippet is
        truncated mid-way through the reranking setup, and the ``def`` line's
        class-level indentation was lost during extraction.
        """
        # Union of all pmids retrieved for any question.
        f = open(self.ret_file, 'r')
        data_q = json.load(f)
        abstracts_needed = set()
        for i in range(len(data_q["questions"])):
            abstracts_needed = abstracts_needed | set(data_q["questions"][i]["retrieved"])
        f.close()

        print "Collecting Abstracts.."
        # Stream the corpus and keep only the abstracts we actually need;
        # stop early once every requested pmid has been found.
        f = open(self.corpus_file, 'r')
        corpus = ijson.items(f, 'articles.item')
        for article in corpus:
            pmid = article["pmid"]
            if pmid in abstracts_needed:
                self.corpus_index[pmid] = article["title"] + ' ' + article["abstractText"]
                abstracts_needed.remove(pmid)
                if not abstracts_needed:
                    break
        f.close()


        print len(self.corpus_index)
        # Accumulators for the reranking pass below (snippet cut off before
        # they are populated).
        q_array_q = []
        q_array_d = []
        q_array_max = []
        print "Reranking.."
        n_questions = len(data_q["questions"])