# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# NOTE(review): this excerpt starts mid-statement — the lines below are the
# trailing keyword arguments of a call that began before the first visible
# line, and all indentation has been lost in this paste.  Code left untouched.
njobs = options.crf_njobs,
max_iter = options.max_iter,
inference_cache = options.crf_inference_cache)
# --rm option: delete the model and stop processing here
if options.rm:
doer.rm()
return
# Expand each of the four option lists (train/test/run/fold) into collections of folders
lTrn, lTst, lRun, lFold = [_checkFindColDir(lsDir, bAbsolute=False) for lsDir in [options.lTrn, options.lTst, options.lRun, options.lFold]]
# if options.bAnnotate:
# doer.annotateDocument(lTrn)
# traceln('annotation done')
# sys.exit(0)
traceln("- classes: ", doer.getGraphClass().getLabelNameList())
## use. a_mpxml files
#doer.sXmlFilenamePattern = doer.sLabeledXmlFilenamePattern
# Cross-validation driver: either initialize the folds or run one fold.
# NOTE(review): the elif chain is truncated at the excerpt boundary below.
if options.iFoldInitNum or options.iFoldRunNum or options.bFoldFinish:
if options.iFoldInitNum:
"""
initialization of a cross-validation
"""
# Prepare the N folds and persist the split on disk (test_size=0.25 in this copy)
splitter, ts_trn, lFilename_trn = doer._nfold_Init(lFold, options.iFoldInitNum, test_size=0.25, random_state=None, bStoreOnDisk=True)
elif options.iFoldRunNum:
"""
Run one fold
"""
# Run a single fold previously initialized on disk; warm-start and pickle flags forwarded
oReport = doer._nfold_RunFoldFromDisk(options.iFoldRunNum, options.warm, options.pkl)
def testDirInit(self, dir):
# Record the test-folder layout (<dir>/xml, <dir>/run, <dir>/ref, optional <dir>/out)
# and locate the variant-specific config file; validate that the structure is sane.
# NOTE(review): indentation was lost in this paste — the else/assert/exit tail
# below may belong to the main-script fragment rather than to this method; confirm
# against the original file before re-indenting.
self.testFolder = dir
self.testDirXML = os.path.join(dir, "xml")
self.testDirRUN = os.path.join(dir, "run")
self.testDirREF = os.path.join(dir, "ref")
self.testDirOUT = os.path.join(dir, "out") #only used if the --out modifier has been specified to keep the output
self.testConfigFile = os.path.join(dir, self._variantName+configFileBaseName+extXML)
traceln("- config file is: ", self.testConfigFile)
#@optional param filename: where to store the result of the process (not the result of the test)
# Note: for ascendent compatibility, we inspect self to determine if the optional parameter is supported or not.
# (old component do not have this capability)
# Sanity check: if any of xml/run/ref exists, all three must exist, plus the config file
if os.path.isdir(self.testDirXML) or os.path.isdir(self.testDirRUN) or os.path.isdir(self.testDirREF):
if os.path.isdir(self.testDirXML) and os.path.isdir(self.testDirRUN) and os.path.isdir(self.testDirREF):
if not os.path.isfile(self.testConfigFile):
raise ComponentException("Config file missing: %s"%self.testConfigFile)
else:
raise ComponentException("Invalid test structure, needs the 'xml', 'run', 'ref' directories!")
else:
#ok, let's create everything
assert False, "Internal error"
#no more processing!!
exit(0)
#-------------------
# Dispatch on what was provided: cross-validated evaluation, training,
# stand-alone testing, and (optionally) applying the model to a run collection.
if lFold:
# N-fold evaluation (3 folds, 25% test split); the report object is
# gzip-pickled despite the ".txt" extension of the filename.
loTstRpt = doer.nfold_Eval(lFold, 3, .25, None, options.pkl)
sReportPickleFilename = os.path.join(sModelDir, sModelName + "__report.txt")
traceln("Results are in %s"%sReportPickleFilename)
graph.GraphModel.GraphModel.gzip_cPickle_dump(sReportPickleFilename, loTstRpt)
elif lTrn:
doer.train_save_test(lTrn, lTst, options.warm, options.pkl)
# best_params_ only exists when the baseline model was fitted through a
# parameter search — best-effort trace, silently skipped otherwise
try: traceln("Baseline best estimator: %s"%doer.bsln_mdl.best_params_) #for CutSearch
except: pass
traceln(" --- CRF Model ---")
traceln(doer.getModel().getModelInfo())
elif lTst:
# Test an existing model; optionally dump the detailed report to disk
doer.load()
tstReport = doer.test(lTst)
traceln(tstReport)
if options.bDetailedReport:
traceln(tstReport.getDetailledReport())
sReportPickleFilename = os.path.join(sModelDir, sModelName + "__detailled_report.txt")
graph.GraphModel.GraphModel.gzip_cPickle_dump(sReportPickleFilename, tstReport)
if lRun:
if options.storeX or options.applyY:
# External ML methods only need the transformer, so a failed load is tolerated
try: doer.load()
except: pass #we only need the transformer
lsOutputFilename = doer.runForExternalMLMethod(lRun, options.storeX, options.applyY, options.bRevertEdges)
else:
doer.load()
# NOTE(review): this fold-handling block is a near-duplicate of the one earlier
# in the paste — the excerpt appears to contain several copies of the same
# main() tail, likely a merge artifact.
if options.iFoldInitNum or options.iFoldRunNum or options.bFoldFinish:
if options.iFoldInitNum:
"""
initialization of a cross-validation
"""
# Here test_size=None (the earlier copy used 0.25) — TODO confirm which is current
splitter, ts_trn, lFilename_trn = doer._nfold_Init(lFold, options.iFoldInitNum, test_size=None, random_state=None, bStoreOnDisk=True)
elif options.iFoldRunNum:
"""
Run one fold
"""
oReport = doer._nfold_RunFoldFromDisk(options.iFoldRunNum, options.warm, options.pkl)
traceln(oReport)
elif options.bFoldFinish:
# Aggregate the per-fold reports previously produced on disk
tstReport = doer._nfold_Finish()
traceln(tstReport)
else:
assert False, "Internal error"
#no more processing!!
exit(0)
#-------------------
# NOTE(review): third copy of the evaluation/train/test/run dispatch.  It even
# contains a duplicated nested "if lFold:" right after the first elif lTrn
# branch — further evidence this paste is a merge artifact; do not re-indent
# without consulting the original file.
if lFold:
loTstRpt = doer.nfold_Eval(lFold, 3, .25, None, options.pkl)
import graph.GraphModel
sReportPickleFilename = os.path.join(sModelDir, sModelName + "__report.txt")
traceln("Results are in %s"%sReportPickleFilename)
graph.GraphModel.GraphModel.gzip_cPickle_dump(sReportPickleFilename, loTstRpt)
elif lTrn:
doer.train_save_test(lTrn, lTst, options.warm, options.pkl)
# Best-effort trace of the baseline's grid-search winner (attribute may be absent)
try: traceln("Baseline best estimator: %s"%doer.bsln_mdl.best_params_) #for GridSearch
except: pass
if lFold:
loTstRpt = doer.nfold_Eval(lFold, 3, .25, None, options.pkl)
import graph.GraphModel
sReportPickleFilename = os.path.join(sModelDir, sModelName + "__report.txt")
traceln("Results are in %s"%sReportPickleFilename)
graph.GraphModel.GraphModel.gzip_cPickle_dump(sReportPickleFilename, loTstRpt)
elif lTrn:
doer.train_save_test(lTrn, lTst, options.warm, options.pkl)
try: traceln("Baseline best estimator: %s"%doer.bsln_mdl.best_params_) #for GridSearch
except: pass
traceln(" --- CRF Model ---")
traceln(doer.getModel().getModelInfo())
elif lTst:
doer.load()
tstReport = doer.test(lTst)
traceln(tstReport)
if options.bDetailedReport:
traceln(tstReport.getDetailledReport())
sReportPickleFilename = os.path.join(sModelDir, sModelName + "__detailled_report.txt")
graph.GraphModel.GraphModel.gzip_cPickle_dump(sReportPickleFilename, tstReport)
if lRun:
if options.storeX or options.applyY:
try: doer.load()
except: pass #we only need the transformer
lsOutputFilename = doer.runForExternalMLMethod(lRun, options.storeX, options.applyY, options.bRevertEdges)
else:
doer.load()
# Standard path: apply the trained model to the run collection
lsOutputFilename = doer.predict(lRun)
traceln("Done, see in:\n %s"%lsOutputFilename)
# NOTE(review): fourth copy of the same dispatch tail; this one starts
# mid-branch — the "if lFold:" that governs the first line is missing from
# the excerpt.  Code left byte-identical.
loTstRpt = doer.nfold_Eval(lFold, 3, .25, None, options.pkl)
sReportPickleFilename = os.path.join(sModelDir, sModelName + "__report.txt")
traceln("Results are in %s"%sReportPickleFilename)
graph.GraphModel.GraphModel.gzip_cPickle_dump(sReportPickleFilename, loTstRpt)
elif lTrn:
doer.train_save_test(lTrn, lTst, options.warm, options.pkl)
try: traceln("Baseline best estimator: %s"%doer.bsln_mdl.best_params_) #for GridSearch
except: pass
traceln(" --- CRF Model ---")
traceln(doer.getModel().getModelInfo())
elif lTst:
doer.load()
tstReport = doer.test(lTst)
traceln(tstReport)
if options.bDetailedReport:
traceln(tstReport.getDetailledReport())
sReportPickleFilename = os.path.join(sModelDir, sModelName + "__detailled_report.txt")
graph.GraphModel.GraphModel.gzip_cPickle_dump(sReportPickleFilename, tstReport)
# Finally apply the model to the run collection (if any)
if lRun:
if options.storeX or options.applyY:
try: doer.load()
except: pass #we only need the transformer
lsOutputFilename = doer.runForExternalMLMethod(lRun, options.storeX, options.applyY, options.bRevertEdges)
else:
doer.load()
lsOutputFilename = doer.predict(lRun)
traceln("Done, see in:\n %s"%lsOutputFilename)
# NOTE(review): fragment from a GCN model-building method (TF1 graph mode);
# it starts and ends mid-method and indentation was lost in this paste.
# Final linear classifier: W_classif is (2*node_indim, n_classes), initialized
# uniformly in +/- 1/sqrt(node_dim); B_classif starts at zero.
self.W_classif = tf.Variable(tf.random_uniform((2*self.node_indim, self.n_classes),
-1.0 / math.sqrt(self.node_dim),
1.0 / math.sqrt(self.node_dim)),
name="W_classif",dtype=np.float32)
self.B_classif = tf.Variable(tf.zeros([self.n_classes]), name='B_classif',dtype=np.float32)
self.train_var.append((self.W_classif))
self.train_var.append((self.B_classif))
# Per-node dropout mask built from a vector of ones, stored as a diagonal
# matrix ND (second tf.nn.dropout arg is 1 - dropout_p_node, i.e. TF1
# keep_prob semantics — presumably; confirm the TF version in use).
self.node_dropout_ind = tf.nn.dropout(tf.ones([self.nb_node], dtype=tf.float32), 1 - self.dropout_p_node)
self.ND = tf.diag(self.node_dropout_ind)
# Edge dropout is active when either edge rate is positive
edge_dropout = self.dropout_rate_edge> 0.0 or self.dropout_rate_edge_feat > 0.0
traceln(' -- Edge Dropout',edge_dropout, self.dropout_rate_edge,self.dropout_rate_edge_feat)
if self.num_layers==1:
# Single-layer case: one dense projection of the node input, then one
# edge convolution, concatenated and re-activated.
self.H = self.activation(tf.add(tf.matmul(self.node_input, self.Wnl0), self.Bnl0))
self.hidden_layers = [self.H]
traceln(" -- H shape",self.H.get_shape())
P = self.fastconvolve(self.Wel0,self.Bel0,self.F,self.Ssparse,self.Tsparse,self.H,self.nconv_edge,self.Sshape,self.nb_edge,
self.dropout_p_edge,self.dropout_p_edge_feat,stack=self.stack_instead_add,use_dropout=edge_dropout,
use_attention=self.sum_attention
)
# Concatenate node features with the convolved edge message along axis 1
Hp = tf.concat([self.H, P], 1)
Hi=self.activation(Hp)
self.hidden_layers.append(Hi)
elif self.num_layers>1:
def processPage(self, page):
    """Segment the page into X-cuts and trace the size of each cut.

    Builds one 20-pixel-wide interval per text object of the page (anchored
    at the object's X coordinate), merges them with util.XYcut.mergeSegments,
    then, for every resulting cut, traces how many objects it contains
    (objects ordered by their Y coordinate).
    """
    from util.XYcut import mergeSegments
    ### shrinking to be done:
    lInterval = [(obj.getX(), obj.getX() + 20, obj)
                 for obj in page.getAllNamedObjects(XMLDSTEXTClass)]
    lCuts, x1, x2 = mergeSegments(lInterval, 0)
    for x, y, cut in lCuts:
        lObj = sorted(cut, key=lambda o: o.getY())
        traceln(len(lObj))
        # traceln (list(map(lambda o:o.getContent(),lObj)))
Get the DOM, the DOM page node, the page object
iterator on the DOM, that returns nodes (of class Block)
"""
# NOTE(review): the lines above are the tail of a docstring whose def line is
# not part of this excerpt; indentation was lost in this paste.
#--- XPATH contexts
assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
# Enumerate all relevant DOM nodes of the page via the configured xpath
lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS) #all relevant nodes of the page
for ndBlock in lNdBlock:
domid = ndBlock.get("id")
sText = self._get_GraphNodeText(doc, domNdPage, ndBlock)
# Missing text is tolerated: warn for the first 32 occurrences, then announce
# once (at the 33rd) that warnings stop.
if sText == None:
sText = ""
NodeType_PageXml.nbNoTextWarning += 1
if NodeType_PageXml.nbNoTextWarning < 33:
traceln("Warning: no text in node %s"%domid)
elif NodeType_PageXml.nbNoTextWarning == 33:
traceln("Warning: no text in node %s - *** %d repetition : I STOP WARNING ***" % (domid, NodeType_PageXml.nbNoTextWarning))
#raise ValueError, "No text in node: %s"%ndBlock
#now we need to infer the bounding box of that object
lXY = PageXml.getPointList(ndBlock) #the polygon
# Nodes without coordinates are skipped entirely
if lXY == []:
continue
plg = Polygon(lXY)
try:
x1,y1, x2,y2 = plg.fitRectangle()
except ZeroDivisionError:
# NOTE(review): the except body below consists only of commented-out lines,
# so this fragment is syntactically incomplete as shown — the real handler
# must lie beyond the excerpt boundary.
# traceln("Warning: ignoring invalid polygon id=%s page=%s"%(ndBlock.prop("id"), page.pnum))
# continue
# if True:
"""
Compute:
- edges between each block and the cut line above/across/below the block
- edges between cut lines
return a list of edges
"""
# NOTE(review): docstring of a function whose def line precedes this excerpt;
# the body is cut off right after the first Edge_BL construction below.
#augment the block with the coordinate of its baseline central point
for blk in lClassicPageNode:
try:
x,y = BaselineCutAnnotator.getDomBaselineXY(blk.node)
blk.x_bslne = x
blk.y_bslne = y
except IndexError:
# No Baseline found for this block: fall back to its x2/y2 corner
traceln("** WARNING: no Baseline in ", blk.domid)
traceln("** Using x2 and y2 instead... :-/")
blk.x_bslne = blk.x2
blk.y_bslne = blk.y2
# Cut lines are horizontal (y1 == y2 asserted); round to integer coordinates
for cutBlk in lSpecialPageNode:
assert cutBlk.y1 == cutBlk.y2
cutBlk.y1 = int(round(cutBlk.y1)) #DeltaFun make float
cutBlk.y2 = cutBlk.y1
#block to cut line edges
lEdge = []
for blk in lClassicPageNode:
for cutBlk in lSpecialPageNode:
# Link a block to a cut when the baseline Y exactly matches the cut's Y
if blk.y_bslne == cutBlk.y1:
edge = Edge_BL(blk, cutBlk)