Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def convert_ccgbank_to_json(ccgbankpath):
    """Return the samples produced for the CCGbank file at ``ccgbankpath``.

    Module-level convenience wrapper: the path is forwarded to
    ``TrainingDataCreator.convert_json`` and its result is handed back
    unchanged.
    """
    samples = TrainingDataCreator.convert_json(ccgbankpath)
    return samples
def convert_json(autopath):
    """Build and return the samples for the trees read from ``autopath``.

    A ``TrainingDataCreator`` is constructed with ``autopath`` and ``None``
    for its three remaining arguments.  Every tree yielded by
    ``read_ccgbank`` is collected, the number of trees is logged, and
    ``_create_samples`` is run on them before ``samples`` is returned.
    """
    creator = TrainingDataCreator(autopath, None, None, None)

    # read_ccgbank yields 3-tuples; only the last element (the tree) is kept.
    parsed = []
    for _, _, tree in read_ccgbank(creator.filepath):
        parsed.append(tree)

    logger.info(f'loaded {len(parsed)} trees')
    creator._create_samples(parsed)
    return creator.samples
def create_testdata(args):
    """Create the test-data files under ``args.OUT``.

    The trees yielded by ``read_ccgbank`` are passed to ``_create_samples``;
    afterwards two files are written:

    * ``testdata.json`` -- ``samples`` serialised with ``json.dump``
    * ``testsents.txt`` -- one element of ``sents`` per line

    ``args`` must provide ``PATH``, ``OUT`` (supporting the ``/`` operator)
    and the three ``*_freq_cut`` attributes.

    NOTE(review): the cut-offs are passed here in the order (word, cat,
    char), while ``create_traindata`` passes (word, char, cat) -- confirm
    which order the ``TrainingDataCreator`` constructor expects.
    """
    creator = TrainingDataCreator(
        args.PATH, args.word_freq_cut, args.cat_freq_cut, args.char_freq_cut)

    # read_ccgbank yields 3-tuples; only the last element (the tree) is kept.
    parsed = []
    for _, _, tree in read_ccgbank(creator.filepath):
        parsed.append(tree)
    creator._create_samples(parsed)

    json_path = args.OUT / 'testdata.json'
    with open(json_path, 'w') as out:
        logger.info(f'writing to {out.name}')
        json.dump(creator.samples, out)

    sents_path = args.OUT / 'testsents.txt'
    with open(sents_path, 'w') as out:
        logger.info(f'writing to {out.name}')
        for sent in creator.sents:
            print(sent, file=out)
help='only allow categories which appear >= freq-cut')
# Frequency thresholds for words and characters, plus the run mode.
parser.add_argument(
    '--word-freq-cut', type=int, default=5,
    help='only allow words which appear >= freq-cut')
parser.add_argument(
    '--char-freq-cut', type=int, default=5,
    help='only allow characters which appear >= freq-cut')
parser.add_argument('--mode', choices=['train', 'test'], default='train')
args = parser.parse_args()

# 'train' builds the training data; the only other accepted mode, 'test',
# builds the test data.
if args.mode == 'train':
    build = TrainingDataCreator.create_traindata
else:
    build = TrainingDataCreator.create_testdata
build(args)
# Create the training-side data from the trees returned by read_ccgbank and
# write frequency-filtered vocabularies under args.OUT.
# NOTE(review): this excerpt stops right after `chars` is computed; the rest of
# the function body is not visible here.
def create_traindata(args):
# NOTE(review): the cut-offs are passed as (word, char, cat) here but as
# (word, cat, char) in create_testdata -- confirm against the constructor.
self = TrainingDataCreator(args.PATH,
args.word_freq_cut,
args.char_freq_cut,
args.cat_freq_cut)
# read_ccgbank yields 3-tuples; only the last element (the tree) is kept.
trees = [tree for _, _, tree in read_ccgbank(self.filepath)]
# Every tree is traversed before any filtering; presumably this is what fills
# self.cats / self.words / self.chars -- TODO confirm in _traverse.
for tree in trees:
self._traverse(tree)
self._create_samples(trees)
# Keep only the entries whose value reaches the corresponding cut-off.
cats = {k: v for k, v in self.cats.items() if v >= self.cat_freq_cut}
self._write(cats, args.OUT / 'target.txt')
words = {k: v for k, v in self.words.items() if v >= self.word_freq_cut}
self._write(words, args.OUT / 'words.txt')
chars = {k: v for k, v in self.chars.items() if v >= self.char_freq_cut}
type=int,
default=5,
help='only allow words which appear >= freq-cut')
# Character frequency threshold and the run mode.
parser.add_argument(
    '--char-freq-cut', type=int, default=5,
    help='only allow characters which appear >= freq-cut')
parser.add_argument('--mode', choices=['train', 'test'], default='train')
args = parser.parse_args()

# 'train' builds the training data; the only other accepted mode, 'test',
# builds the test data.
if args.mode == 'train':
    build = TrainingDataCreator.create_traindata
else:
    build = TrainingDataCreator.create_testdata
build(args)