Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
sys.exit(1)
# Sanity check that this is a file log
if 'files' not in args.bro_log:
    print('This example only works with Zeek files.log files..')
    sys.exit(1)

# File may have a tilde in it. The membership test above only lets a
# non-empty string containing 'files' through, so no extra truthiness
# guard is needed before expanding it.
args.bro_log = os.path.expanduser(args.bro_log)

# Create a VirusTotal Query Class
vtq = vt_query.VTQuery()

# Run the bro reader on a given log file
reader = bro_log_reader.BroLogReader(args.bro_log, tail=True)
for row in reader.readrows():
    # Prefer the sha256 and fall back to the sha1 (Zeek uses - for an empty field)
    file_sha = row.get('sha256', '-')
    if file_sha == '-':
        file_sha = row.get('sha1', '-')
    if file_sha == '-':
        # Neither hash is present in this row, so there is nothing to look up
        print('Could not find a sha256 or a sha1 key! Skipping...')
        continue

    # Make the query with either sha
    results = vtq.query_file(file_sha)
    if results.get('positives', 0) > 1:  # At least two hits
        pprint(results)
# Tor-detection example: validates the command line, then streams rows from an
# ssl.log, collecting destination ports and reading each row's certificate issuer.
# Check for unknown args
if commands:
print('Unrecognized args: %s' % commands)
sys.exit(1)
# Sanity check that this is a ssl log (plain substring test on the path string)
if 'ssl' not in args.bro_log:
print('This example only works with Zeek ssl.log files..')
sys.exit(1)
# File may have a tilde in it
if args.bro_log:
args.bro_log = os.path.expanduser(args.bro_log)
# Run the bro reader on the ssl.log file looking for potential Tor connections
# (the tail flag is taken straight from args.t)
reader = bro_log_reader.BroLogReader(args.bro_log, tail=args.t)
# Just a counter to keep an eye on how many possible Tor connections we identify
# NOTE(review): nothing in the lines visible here updates this counter
number = 0
# An empty list to use for the port statistics
ports = []
for row in reader.readrows():
# Add the destination port to the list of ports
ports.append(row['id.resp_p'])
# Pull out the Certificate Issuer; a row without that field ends the script via sys.exit(1)
try:
issuer = row['issuer']
except KeyError:
print('Could not find the issuer field in your ssl.log. Please verify your log file.')
sys.exit(1)
# Check if the issuer matches the known Tor format
# NOTE(review): issuer_regex is defined outside this excerpt (presumably a compiled
# pattern -- confirm), and the body of this `if` is not part of the excerpt either
if issuer_regex.match(issuer):
# Spoofed-certificate example: streams an x509.log and flags certificates whose
# issuer mentions Let's Encrypt and whose subject contains a well-known brand name.
if 'x509' not in args.bro_log:
print('This example only works with Zeek x509.log files..')
sys.exit(1)
# File may have a tilde in it
if args.bro_log:
args.bro_log = os.path.expanduser(args.bro_log)
# Create a VirusTotal Query Class
vtq = vt_query.VTQuery()
# These domains may be spoofed with a certificate issued by 'Let's Encrypt'
spoofed_domains = set(['paypal', 'gmail', 'google', 'apple','ebay', 'amazon'])
# Run the bro reader on the x509.log file looking for spoofed domains
reader = bro_log_reader.BroLogReader(args.bro_log, tail=True)
for row in reader.readrows():
# Pull out the Certificate Issuer
issuer = row['certificate.issuer']
if "Let's Encrypt" in issuer:
# Check if the certificate subject has any spoofed domains
subject = row['certificate.subject']
# Drops the first three characters unconditionally
# NOTE(review): assumes every subject starts with 'CN=' -- confirm
domain = subject[3:] # Just chopping off the 'CN=' part
# NOTE(review): the comprehension reuses the name `domain`. Under Python 3 that
# variable is local to the comprehension, so the outer `domain` (the trimmed
# subject) is what reaches query_url below; under Python 2 it would be
# overwritten by the last brand name iterated.
if any([domain in subject for domain in spoofed_domains]):
print('\n<<< Suspicious Certificate Found >>>')
pprint(row)
# Make a Virus Total query with the trimmed certificate subject (just for fun)
results = vtq.query_url(domain)
# The body of this `if` is not part of this excerpt
if results.get('positives', 0) >= 2: # At least two hits
# Bail out on any leftover command line arguments
if commands:
    print('Unrecognized args: %s' % commands)
    sys.exit(1)

# Only http logs make sense here (plain substring test on the path)
if 'http' not in args.bro_log:
    print('This example only works with Zeek http.log files..')
    sys.exit(1)

# Resolve a leading tilde in the path
if args.bro_log:
    args.bro_log = os.path.expanduser(args.bro_log)

# Tally user agents while streaming rows out of the log
http_agents = Counter()
reader = bro_log_reader.BroLogReader(args.bro_log, tail=args.t)
for count, row in enumerate(reader.readrows()):
    http_agents.update((row['user_agent'],))

    # Unless args.s is set, print an interim report on rows 0, 100, 200, ...
    # The slice walks most_common() backwards from its tail (at most 49 entries),
    # i.e. the rarest agents first.
    if not args.s and count % 100 == 0:
        print('\n<<>>')
        pprint(http_agents.most_common()[:-50:-1])

# Final report once the reader stops producing rows (if it ever does)
print('\nLeast Common User Agents:')
pprint(http_agents.most_common()[:-50:-1])
sys.exit(1)
# Setup for the streaming-clustering example: validates the dns.log path, opens
# the row source and builds the cache / clustering / matrix helpers used later.
# File may have a tilde in it
if args.bro_log:
args.bro_log = os.path.expanduser(args.bro_log)
# Sanity check dns log (plain substring test on the path string)
# NOTE(review): log_type is assigned here but not read in the lines visible in this excerpt
if 'dns' in args.bro_log:
log_type = 'dns'
else:
print('This example only works with Zeek with dns.log files..')
sys.exit(1)
# Create a Zeek log reader
# NOTE(review): this reader is replaced a few lines below before it is ever used,
# and the 'Opening Data File' message is therefore printed twice -- looks like two
# alternative row sources were both left enabled; confirm which one is intended
print('Opening Data File: {:s}'.format(args.bro_log))
reader = bro_log_reader.BroLogReader(args.bro_log, tail=True)
# Create a Zeek IDS log live simulator (rebinds `reader`)
print('Opening Data File: {:s}'.format(args.bro_log))
reader = live_simulator.LiveSimulator(args.bro_log, eps=10) # 10 events per second
# Create a Dataframe Cache
df_cache = dataframe_cache.DataFrameCache(max_cache_time=600) # 10 minute cache
# Streaming Clustering Class
batch_kmeans = MiniBatchKMeans(n_clusters=5, verbose=True)
# Use the BroThon DataframeToMatrix class
to_matrix = dataframe_to_matrix.DataFrameToMatrix()
# Add each new row into the cache
# NOTE(review): time_delta is only assigned here; its unit and use lie outside
# this excerpt (presumably seconds between clustering passes -- confirm)
time_delta = 10
def _get_field_info(self, log_filename):
    """Internal Method: Read a Zeek log header and return its field names and types

    Args:
        log_filename (string): Path of the Zeek log whose header is read
    Returns:
        tuple: (field_names, field_types), i.e. the second and third items of
               the 4-item result of BroLogReader._parse_bro_header()
    """
    header = bro_log_reader.BroLogReader(log_filename)._parse_bro_header(log_filename)
    _leading, names, types, _trailing = header
    return names, types
def create_dataframe(self, log_filename, fillna=True):
""" Create a Spark dataframe from a Bro/Zeek log file
Args:
log_filename (string): The full path to the Zeek log
fillna (bool): Fill in NA/NaN values (default=True)
"""
# Create a Zeek log reader just to read in the header for names and types
_bro_reader = bro_log_reader.BroLogReader(log_filename)
_, field_names, field_types, _ = _bro_reader._parse_bro_header(log_filename)
# Get the appropriate types for the Spark Dataframe
spark_schema = self.build_spark_schema(field_names, field_types)
# Now actually read the Zeek Log using Spark read CSV
_df = self.spark.read.csv(log_filename, schema=spark_schema, sep='\t', comment="#", nullValue='-')
''' Secondary processing (cleanup)
- Fix column names with '.' in them
- Fill in Nulls (optional)
- timestamp convert
- boolean convert
'''
# Fix column names
# Generator body: yields every row of every file in self._files, transparently
# unpacking '.gz' files into a temporary file first.
# For each file (may be just one) create a BroLogReader and use it
# NOTE: the loop variable is the instance attribute itself, so self._filepath is
# overwritten on every iteration (and again below for gzipped files)
for self._filepath in self._files:
# Check if the file is gzip-compressed (decided by the '.gz' extension only)
tmp = None
if self._filepath.endswith('.gz'):
# delete=False keeps the file on disk so it can be reopened by name below
# NOTE(review): the NamedTemporaryFile object itself is never explicitly closed
# in the lines visible here; only its name is used
tmp = tempfile.NamedTemporaryFile(delete=False)
with gzip.open(self._filepath, 'rb') as f_in, open(tmp.name, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
# Set the file path to the new temp file
self._filepath = tmp.name
# Create a BroLogReader
reader = bro_log_reader.BroLogReader(self._filepath)
for row in reader.readrows():
yield row
# Clean up any temp files
# NOTE(review): there is no try/finally around the yield loop, so this removal
# is skipped if the consumer stops iterating early or readrows() raises
try:
if tmp:
os.remove(tmp.name)
print('Removed temporary file {:s}...'.format(tmp.name))
except IOError:
# Best effort: a temp file that cannot be removed is silently left behind
pass
# Reuse a previously pickled VirusTotal query object if vtq.pkl can be opened,
# otherwise start with a fresh one.
try:
    # NOTE: unpickling can execute arbitrary code, so vtq.pkl must only ever be
    # a file written by this script itself, never one from an untrusted source.
    # The context manager closes the cache file handle right after loading.
    with open('vtq.pkl', 'rb') as cache_file:
        vtq = pickle.load(cache_file)
    print('Opening VirusTotal Query Cache (cache_size={:d})...'.format(vtq.size))
except IOError:
    vtq = vt_query.VTQuery(max_cache_time=60*24*7)  # One week cache

# See our 'Risky Domains' Notebook for the analysis and
# statistical methods used to compute this risky set of TLDs
risky_tlds = {'info', 'tk', 'xyz', 'online', 'club', 'ru', 'website', 'in', 'ws',
              'top', 'site', 'work', 'biz', 'name', 'tech', 'loan', 'win', 'pro'}

# Launch long lived process with signal catcher (save_vtq is handed to it as the callback)
with signal_utils.signal_catcher(save_vtq):

    # Run the bro reader on the dns.log file looking for risky TLDs
    reader = bro_log_reader.BroLogReader(args.bro_log)
    for row in reader.readrows():

        # Pull out the TLD
        query = row['query']
        tld = tldextract.extract(query).suffix

        # Only domains under a risky TLD are worth a VirusTotal lookup
        if tld not in risky_tlds:
            continue

        # Make the query with the full query
        results = vtq.query_url(query)
        if results.get('positives', 0) > 3:  # At least four hits
            print('\nRisky Domain DNS Query Found')
            print('From: {:s} To: {:s} QType: {:s} RCode: {:s}'.format(row['id.orig_h'],
                  row['id.resp_h'], row['qtype_name'], row['rcode_name']))
            pprint(results)
def __init__(self, filepath, eps=10, max_rows=None, only_once=False):
    """Initialization for the LiveSimulator Class

    Args:
        filepath (str): Path handed to BroLogReader (opened with tail=False)
        eps (int): Events Per Second that the simulator will emit events (default = 10)
        max_rows (int): The maximum number of rows to generate (default = None (go forever))
        only_once (bool): Stored as self.only_once (default = False); presumably
                          limits the simulator to a single pass -- confirm against its users
    """
    # Inter-event delays are drawn once, up front:
    #   - normal distribution with mean 1.0/eps and standard deviation 0.5/eps
    #   - negative draws are clamped to 0
    #   - the 1000 precomputed deltas are then cycled endlessly
    mean_delay = 1.0/float(eps)
    delay_spread = .5/float(eps)
    raw_delays = np.random.normal(mean_delay, delay_spread, size=1000)
    self.eps_timer = itertools.cycle([max(0, delay) for delay in raw_delays])

    # Zeek log reader over the given file, not in tail mode
    self.log_reader = bro_log_reader.BroLogReader(filepath, tail=False)

    # Remember the row limit and the only_once flag
    self.max_rows = max_rows
    self.only_once = only_once