Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_invalid(self):
    """Every sample in ``INVALID_JSONS`` must make ``basic_parse`` raise ``common.JSONError``."""
    is_yajl_parse = self.__class__.__name__ == 'YajlParse'
    for document in INVALID_JSONS:
        # Yajl1 does not complain about extra data following the end of a
        # parsed object, so that one sample is not checked for this class.
        if is_yajl_parse and document == YAJL1_PASSING_INVALID:
            continue
        with self.assertRaises(common.JSONError):
            # Drain the event stream; the error surfaces while iterating.
            for _ in self.backend.basic_parse(BytesIO(document)):
                pass
def test_parse(self):
    """``common.parse`` should report the value ``1`` at ``docs.item.meta.item.item``."""
    wanted_prefix = 'docs.item.meta.item.item'
    matches = []
    # Only the value is of interest; the event type is ignored.
    for prefix, _event, value in common.parse(basic_parse(BytesIO(JSON))):
        if prefix == wanted_prefix:
            matches.append(value)
    self.assertEqual(matches, [1])
def test_incomplete(self):
    """Every sample in ``INCOMPLETE_JSONS`` must raise ``common.IncompleteJSONError``."""
    for document in INCOMPLETE_JSONS:
        # The lambda is invoked immediately by assertRaises, so both the
        # parser construction and its full consumption happen under the check.
        self.assertRaises(
            common.IncompleteJSONError,
            lambda: list(self.backend.basic_parse(BytesIO(document))),
        )
def test_items(self):
    """``common.items`` should build each object found under ``docs.item.meta``."""
    expected = [
        [[1], {}],
        {'key': 'value'},
        None,
    ]
    prefixed_events = common.parse(basic_parse(BytesIO(JSON)))
    built = list(common.items(prefixed_events, 'docs.item.meta'))
    self.assertEqual(built, expected)
def test_scalar_builder(self):
    """Feeding the events parsed from ``SCALAR_JSON`` to ``ObjectBuilder`` should yield ``0``."""
    scalar_builder = common.ObjectBuilder()
    event_stream = basic_parse(BytesIO(SCALAR_JSON))
    for kind, payload in event_stream:
        scalar_builder.event(kind, payload)
    self.assertEqual(scalar_builder.value, 0)
def test_object_builder(self):
builder = common.ObjectBuilder()
for event, value in basic_parse(BytesIO(JSON)):
builder.event(event, value)
self.assertEqual(builder.value, {
'docs': [
{
'string': 'строка - тест',
'null': None,
'boolean': False,
'true': True,
'integer': 0,
'double': Decimal('0.5'),
'exponent': 100,
'long': 10000000000,
},
{
'meta': [[1], {}],
def _get_data(self) -> list:
    """Load the processed dataset, building it from the raw JSON file if needed.

    If ``self.root / self.out_filename`` already exists, its contents are
    loaded with ``load_language_modeling`` and returned as-is.

    Otherwise the raw file at ``self.root / self.dirname`` is streamed with
    ``ijson.items(..., 'item')``. For each item the ``'text'`` field is passed
    through ``self._normalize``, stripped and split on newlines; every
    non-empty line becomes one sample. The samples are saved to the processed
    path and the raw file is then deleted.

    :return: the samples (a list of strings when built from the raw file;
        whatever ``load_language_modeling`` returns when loaded from cache)
    """
    out_path_train = self.root / self.out_filename
    if out_path_train.exists():
        return load_language_modeling(out_path_train)

    raw_path = self.root / self.dirname
    dataset = []
    with open(raw_path, 'r', encoding='utf-8') as jfile:
        for item in tqdm(ijson.items(jfile, 'item')):
            text = self._normalize(item['text']).strip()
            # One sample per non-empty line of the document.
            # To treat a whole document as a single sample instead, join the
            # non-empty lines with '\n' and append that one string.
            dataset.extend(line for line in text.split('\n') if line)

    # Save BEFORE deleting the raw file: if saving fails, the raw data is
    # still on disk and the next call can rebuild the dataset.
    save_language_modeling(dataset, to_path=out_path_train)
    raw_path.unlink()
    return dataset
"""
:return: the page's ``g_page_config`` JavaScript object, parsed into a dict
:rtype: dict
"""
if self.detect_anti_spider_from_response(response):
logger.critical(anti_spider_breakpoit_msg)
raise CloseSpider(anti_spider_breakpoit_msg)
# sys.exit() would not quit here; it just raises an exception
# this name is from taobao page: https://s.taobao.com/search?q=空调
g_page_config = ''
for line in response.body.split('\n'):
if 'g_page_config' in line:
g_page_config = line.split('{', 1)[1].rsplit('}', 1)[0]
break
js_obj_gen = ijson.items(StringIO(''.join(('{', g_page_config, '}'))), '')
js_obj = list(js_obj_gen)[0]
if self.detect_anti_spider_from_js_obj(js_obj, response):
logger.critical(anti_spider_breakpoit_msg)
raise CloseSpider(anti_spider_breakpoit_msg)
return js_obj
def counting_generator():
    """Yield each ``clks.item`` entry of ``raw_stream`` as one newline-terminated bytes line.

    For every entry yielded, ``store['count']`` is incremented and
    ``store['totalbytes']`` grows by the length of the encoded line. If ijson
    reports incomplete JSON, ``store['count']`` is reset to 0, a warning is
    logged and the generator stops.
    """
    try:
        for entry in ijson.items(raw_stream, 'clks.item'):
            # Clients often upload base64 strings containing newlines;
            # strip them so each entry occupies exactly one output line.
            line = entry.replace('\n', '').encode() + b'\n'
            store['count'] += 1
            store['totalbytes'] += len(line)
            yield line
    except ijson.common.IncompleteJSONError:
        store['count'] = 0
        app.logger.warning("Stopping as we have received incomplete json")
def counting_generator():
    """Stream the ``clks.item`` entries of ``raw_stream`` as newline-terminated bytes.

    Each yielded entry bumps ``store['count']`` by one and adds the encoded
    line's length to ``store['totalbytes']``. On incomplete JSON the count is
    reset to 0, a warning is logged and iteration ends.
    """
    try:
        for encoded in ijson.items(raw_stream, 'clks.item'):
            # Uploaded base64 strings frequently contain embedded newlines;
            # remove them before framing the entry with a single trailing one.
            payload = encoded.replace('\n', '').encode() + b'\n'
            store['count'] += 1
            store['totalbytes'] += len(payload)
            yield payload
    except ijson.common.IncompleteJSONError:
        store['count'] = 0
        app.logger.warning("Stopping as we have received incomplete json")