def test():
    """Test for LogToSparkDF Class"""
    import os
    from zat import log_to_sparkdf
    from zat.utils import file_utils
    from pyspark.sql import SparkSession

    # Spin up a local Spark Session (with 4 executors)
    spark = SparkSession.builder.master('local[4]').appName('my_awesome').getOrCreate()

    # Grab a test file
    data_path = file_utils.relative_dir(__file__, '../data')
    log_path = os.path.join(data_path, 'ftp.log')

    # Convert it to a Spark DataFrame
    log_to_spark = log_to_sparkdf.LogToSparkDF(spark)
    spark_df = log_to_spark.create_dataframe(log_path)

    # Print out the head (show() prints directly and returns None, so no print() wrapper)
    spark_df.show()

    # Print out the datatypes
    spark_df.printSchema()

    num_rows = spark_df.count()
    print("Number of Spark DataFrame rows: {:d}".format(num_rows))
    columns = spark_df.columns
    print("Columns: {:s}".format(','.join(columns)))

    # Test a bunch of the other sample logs
    tests = ['app_stats.log', 'dns.log', 'http.log', 'notice.log', 'tor_ssl.log',
             'conn.log', 'dhcp.log', 'dhcp_002.log', 'files.log', 'smtp.log', 'weird.log']
    for log_name in tests:
        spark_df = log_to_spark.create_dataframe(os.path.join(data_path, log_name))
        print('{:s}: {:d} rows'.format(log_name, spark_df.count()))
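
# Since create_dataframe() returns a standard PySpark DataFrame, the usual
# Spark SQL machinery applies. A minimal sketch of a follow-on query, assuming
# a 'spark' session, 'log_to_spark' converter, and 'data_path' set up as in
# test() above; the view name 'ftp' is an arbitrary choice for illustration.
def query_example(spark, log_to_spark, data_path):
    import os
    ftp_df = log_to_spark.create_dataframe(os.path.join(data_path, 'ftp.log'))
    ftp_df.createOrReplaceTempView('ftp')  # register the DataFrame for SQL queries
    spark.sql('SELECT COUNT(*) AS num_rows FROM ftp').show()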
def log_to_parquet(log_in, parquet_out):
    """Convert a Zeek log file to a Parquet file"""
    from zat import log_to_sparkdf
    from pyspark.sql import SparkSession

    # Spin up a local Spark Session (with 4 executors)
    spark = SparkSession.builder.master('local[4]').appName('my_awesome').getOrCreate()

    # Use the ZAT class to load our log file into a Spark DataFrame (2 lines of code!)
    spark_it = log_to_sparkdf.LogToSparkDF(spark)
    spark_df = spark_it.create_dataframe(log_in)

    # Write it out as a Parquet file
    spark_df.write.parquet(parquet_out)
    print('{:s} --> {:s}'.format(log_in, parquet_out))
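
# A quick usage sketch for log_to_parquet() above. The file paths below are
# hypothetical stand-ins; reading the Parquet file back in with
# spark.read.parquet() confirms the round trip.
if __name__ == '__main__':
    from pyspark.sql import SparkSession

    log_to_parquet('/path/to/conn.log', '/path/to/conn.parquet')  # hypothetical paths

    # getOrCreate() hands back the session already spun up inside log_to_parquet()
    spark = SparkSession.builder.master('local[4]').appName('my_awesome').getOrCreate()
    conn_df = spark.read.parquet('/path/to/conn.parquet')
    conn_df.printSchema()
    print('Round-trip row count: {:d}'.format(conn_df.count()))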