How to use the ijson.items function in ijson

To help you get started, we’ve selected a few ijson.items examples based on popular ways it is used in public projects.
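
A minimal sketch of the basic call pattern is shown below before the project snippets; the file name data.json and its layout are illustrative assumptions rather than something taken from these projects.

import ijson

# Stream items from a hypothetical data.json whose top level is a JSON array,
# without loading the whole file into memory. The prefix 'item' selects each
# element of the root array; a dotted prefix such as 'rows.item' would select
# the elements of a nested array instead.
with open('data.json', 'rb') as f:
    for record in ijson.items(f, 'item'):
        print(record)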


github lyeoni / prenlp / prenlp / data / dataset / language_modeling.py View on Github external
def _get_data(self) -> list:
        out_path_train = self.root/self.out_filename

        if out_path_train.exists():
            train = load_language_modeling(out_path_train)
            dataset = train
        else:
            dataset = []
            with open(self.root/self.dirname, 'r', encoding='utf-8') as jfile:
                for item in tqdm(ijson.items(jfile, 'item')):
                    text = self._normalize(item['text']).strip()
                    samples = list(filter(lambda x: len(x) > 0, text.split('\n'))) # split document into sentences(len > 0)
                    dataset += samples
                    # If sample is a document, use below code not above two lines.
                    # sample = '\n'.join(list(filter(lambda x: len(x) > 0, text.split('\n'))))
                    # dataset.append(sample)
                    
            # Save dataset
            (self.root/self.dirname).unlink()
            save_language_modeling(dataset, to_path=out_path_train)
            
        return dataset
github EastonLee / Taobao_Crawler / taobao_crawler / spiders / taobao_2.py View on Github external
"""
        :return dicted js obj  g_page_config:
        :rtype :dict
        """
        if self.detect_anti_spider_from_response(response):
            logger.critical(anti_spider_breakpoit_msg)
            raise CloseSpider(anti_spider_breakpoit_msg)
            #sys.exit() # won't quit, just an exception
        # this name is from taobao page: https://s.taobao.com/search?q=空调
        g_page_config = ''
        for line in response.body.split('\n'):
            if 'g_page_config' in line:
                g_page_config = line.split('{', 1)[1].rsplit('}', 1)[0]
                break

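        # An empty prefix ('') makes ijson.items yield the entire top-level JSON object as a single item.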
        js_obj_gen = ijson.items(StringIO(''.join(('{', g_page_config, '}'))), '')
        js_obj = list(js_obj_gen)[0]

        if self.detect_anti_spider_from_js_obj(js_obj, response):
            logger.critical(anti_spider_breakpoit_msg)
            raise CloseSpider(anti_spider_breakpoit_msg)
        return js_obj
github tetherless-world / setlr / setlr / __init__.py View on Github external
def read_json(location, result):
    selector = result.value(api_vocab.selector)
    if selector is not None:
        selector = selector.value
    else:
        selector = ""
    with get_content(location, result) as fo:
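        # The selector string is used directly as the ijson prefix; "" selects the whole document as one item.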
        yield from enumerate(ijson.items(fo, selector))
github LearningRegistry / LearningRegistry / LR / lr / lib / stream.py View on Github external
def generator(self, instream):
        from ijson import items
        
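        # 'rows.item' streams each element of the top-level "rows" array (the shape of a CouchDB view result).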
        docs = items(instream, 'rows.item')
        self.generator_index = -1
        for doc in docs:
            self.generator_index += 1
            if self.docHandler != None and isinstance(self.docHandler, types.FunctionType):
                yield self.docHandler(doc)
            elif self.docHandler != None and isinstance(self.docHandler, CouchDBDocProcessor):
                yield self.docHandler.process(doc)
            else:
                yield doc
github LearningRegistry / LearningRegistry / LR / lr / lib / stream.py View on Github external
def parse(self, instream):
        from ijson import items
        
        docs = items(instream, 'rows.item')
        count = 0
        for doc in docs:
            count += 1
            if self.docHandler != None and isinstance(self.docHandler, types.FunctionType):
                self.docHandler(doc)
            elif self.docHandler != None and isinstance(self.docHandler, CouchDBDocProcessor):
                self.docHandler.process(doc)
    
            log.debug("DOC: %s" %(json.dumps(doc)))
        
        return count
github ContinuumIO / topik / topik / fileio / in_json.py View on Github external
...     u"Using the hybrid virtual leader and behavioral approach schema, the formation " +
    ...     u"control strategy by means of potential function is proposed. The overall strategy " +
    ...     u"has been successfully applied to the Quadrotor's model of Parrot AR Drone 2.0 in " +
    ...     u"Gazebo simulator programmed using Robot Operating System.\\nAuthor(s) Rizqi, A.A.A. " +
    ...     u"Dept. of Electr. Eng. & Inf. Technol., Univ. Gadjah Mada, Yogyakarta, Indonesia " +
    ...     u"Cahyadi, A.I. ; Adji, T.B.\\nReferenced Items are not available for this document.\\n" +
    ...     u"No versions found for this document.\\nStandards Dictionary Terms are available to " +
    ...     u"subscribers only.",
    ... u'uri': u'http://dig.isi.edu/autonomy/data/article/6871517',
    ... u'datePublished': u'2014',
    ... 'filename': '{}/test_data_large_json.json'.format(test_data_path)}
    True
    """

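    # json_prefix determines which elements ijson.items yields, so the same reader handles a top-level array ('item') as well as nested ones (e.g. 'rows.item').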
    with open(filename, 'r') as f:
        for item in ijson.items(f, json_prefix):
            if hasattr(item, 'keys'): # check if item is a dictionary
                item['filename'] = filename
                yield item
            # check if item is both iterable and not a string
            elif __is_iterable(item) and not isinstance(item, str):
                for sub_item in item:
                    # check if sub_item is a dictionary
                    if hasattr(sub_item, 'keys'):
                        sub_item['filename'] = filename
                        yield sub_item
            else:
                raise ValueError("'item' in json source is not a dict, and is either a string or not iterable: %r" % item)
github montaggroup / montag / pydbtool.py View on Github external
def do_import(args, db):
    pdb = db()
    friend_name = args.friend_name
    friend = pdb.get_friend_by_name(friend_name)
    if not friend:
        print >> sys.stderr, "No friend by that name, check your spelling or create a new friend using add_friend"
        return False
    friend_id = friend['id']

    print "Importing Authors"
    with open(args.file_name) as import_file:
        authors_to_insert = []
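        # 'authors.item' streams each element of the top-level "authors" array, so the import file never has to be read into memory in full.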
        author_docs = ijson.items(import_file, 'authors.item')
        for author_doc in author_docs:
            authors_to_insert.append(author_doc)
            if len(authors_to_insert) >= INSERT_BATCH_SIZE:
                print "."
                pdb.load_author_documents_from_friend(friend_id, authors_to_insert)
                authors_to_insert = []
        if authors_to_insert:
            pdb.load_author_documents_from_friend(friend_id, authors_to_insert)

    print "Importing Tomes"
    with open(args.file_name) as import_file:
        tomes_to_insert = []
        tome_docs = ijson.items(import_file, 'tomes.item')
        for tome_doc in tome_docs:
            tomes_to_insert.append(tome_doc)
            if len(tomes_to_insert) >= INSERT_BATCH_SIZE:
github RTXteam / RTX / code / kg2 / get_nodes_json_from_kg_json.py View on Github external
arg_parser.add_argument('--test', dest='test', action='store_true', default=False)
    arg_parser.add_argument('--inputFile', type=str, nargs=1)
    arg_parser.add_argument('--outputFile', type=str, nargs=1)
    return arg_parser


if __name__ == "__main__":
    args = make_arg_parser().parse_args()
    test_mode = args.test
    temp_file_name = tempfile.mkstemp(prefix="kg2-")[1]
    input_file_name = args.inputFile[0]

    nodes = []
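    # Note: the prefix "nodes" (without ".item") yields the whole "nodes" array as a single item, which is why nodes[0] is taken below.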
    if input_file_name.endswith('.gz'):
        graph = gzip.GzipFile(input_file_name, 'r')
        for node in items(graph, "nodes"):
            nodes.append(node)
    else:
        with open(input_file_name, 'r') as graph:
            for node in items(graph, "nodes"):
                nodes.append(node)

    nodes = nodes[0]
    output_graph = {"nodes": nodes}
    output_file_name = args.outputFile[0]
    kg2_util.save_json(output_graph, output_file_name, test_mode)
github AaronTengDeChuan / pinkcom / scripts / convert_dstc8_data.py View on Github external
def create_train_file(train_file, train_file_out, opt):
    train_file_op = open(train_file_out, "w")
    positive_samples_count = 0
    negative_samples_count = 0
    aug_samples_count = 0

    train_data_handle = open(train_file, 'rb')
    json_data = ijson.items(train_data_handle, 'item')
    for index, entry in enumerate(json_data):
        if opt.heuristic_data_augmentation > 0:
            correct_answer_rows = get_aug_data(entry, min(opt.heuristic_data_augmentation + (opt.heuristic_data_augmentation * index - aug_samples_count), 9))
            aug_samples_count += len(correct_answer_rows)
        else:
            correct_answer_rows = []

        # row = str(index+1) + "\t"
        context = get_context(entry)
        row = context #+ "\t"

        if len(entry['options-for-correct-answers']) == 0:
            correct_answer = {}
            correct_answer['utterance'] = "None"
            target_id = "NONE"
        else:
github hochschule-darmstadt / openartbrowser / scripts / upload_to_elasticsearch / elasticsearch_helper.py View on Github external
except: # If not create
            es.snapshot.create_repository(repository=repository_name, body={"type": "fs", "settings": {"location": backup_directory}})
        es.snapshot.create(repository=repository_name,snapshot=snapshot_name, body={"indices": index_name})
        print("Snapshot created: " + snapshot_name)
        print("See all snapshots with GET /_cat/snapshots/openartbrowser_index_backup")
        print("If fallback to an old snapshot is required close the index with POST /" + index_name +"/_close")
        print("After this apply the snapshot, this will reopen the index POST /_snapshot/openartbrowser_index_backup/" + snapshot_name + "/_restore")
    else:
        es.indices.create(index=index_name) # Create if index not exists

    update_count = 0
    creation_count = 0
    delete_count = 0

    # Document creation
    for item in ijson.items(open(file, 'r', encoding='utf-8'), 'item'):
        # Search the index by the qid and type (only qId is not unique!)
        result = es.search(index=index_name, body={"query": {"bool": {"must": [{"match": {"id": item['id']}},{"match": {"type": item['type']}}]}}})
        result_length = len(result['hits']['hits'])
        if result_length == 1: # If exists update
            elastic_search_id = result['hits']['hits'][0]['_id']
            es.update(id=elastic_search_id, index=index_name, doc_type='data', body={ 'doc': item })
            update_count += 1
        elif result_length >= 2: # Remove current if it is a duplicate (sanity check should not occur)
            elastic_search_id = result['hits']['hits'][0]['_id'] # look up the duplicate's _id so it can be deleted
            es.delete(id=elastic_search_id, index=index_name, doc_type='data')
            delete_count += 1
            #raise RuntimeError("There is a duplicate document in the index following qId: " + item['id']) ToDo: Comment in if there are problems with duplicates
        else:
            es.create(id=uuid.uuid4(), index=index_name, doc_type='data', body=item)
            creation_count += 1

    end = time.time()