# Review the schema of your data
# NOTE(review): `df` is assumed to be a PySpark DataFrame loaded in an earlier
# notebook cell -- confirm upstream. printSchema() prints to stdout and returns None.
df.printSchema()
root
|-- step: integer (nullable = true)
|-- type: string (nullable = true)
|-- amount: double (nullable = true)
|-- nameOrig: string (nullable = true)
|-- oldbalanceOrg: double (nullable = true)
|-- newbalanceOrig: double (nullable = true)
|-- nameDest: string (nullable = true)
|-- oldbalanceDest: double (nullable = true)
|-- newbalanceDest: double (nullable = true)
from pyspark.sql import functions as F

# Rule-based labelling of known fraud patterns.
# A row matching ANY of the three rules below is flagged label = 1, else 0.
# The threshold constants (56900, 105, 12, 1160000) are the rule cut-offs
# used by the original analysis -- presumably tuned on this dataset; confirm.
rule_small_origin_transfer = (
    (F.col("oldbalanceOrg") <= 56900)
    & (F.col("type") == "TRANSFER")
    & (F.col("newbalanceDest") <= 105)
)
rule_drained_origin = (
    (F.col("oldbalanceOrg") > 56900)
    & (F.col("newbalanceOrig") <= 12)
)
rule_large_amount = (
    (F.col("oldbalanceOrg") > 56900)
    & (F.col("newbalanceOrig") > 12)
    & (F.col("amount") > 1160000)
)

is_known_fraud = rule_small_origin_transfer | rule_drained_origin | rule_large_amount
df = df.withColumn("label", F.when(is_known_fraud, 1).otherwise(0))
# Quick sanity statistics on the rule-based labels.
fraud_cases = df.filter(df.label == 1).count()  # rows flagged as fraud
total_cases = df.count()                        # all rows in the dataset
fraud_pct = 1. * fraud_cases / total_cases      # flagged fraction (0..1)

# Report absolute counts together with the raw fraction.
summary_fmt = "Based on these rules, we have flagged %s (%s) fraud cases out of a total of %s cases."
print(summary_fmt % (fraud_cases, fraud_pct, total_cases))

# Register a temporary view so the labelled data can be queried with SQL below.
df.createOrReplaceTempView("financials_labeled")
Based on these rules, we have flagged 255640 (0.04017841706718302) fraud cases out of a total of 6362620 cases.
%sql
-- Aggregate balance deltas per (origin, destination, label) pair and keep
-- pairs whose total destination-side delta is at least 1,000,000.
-- NOTE(review): OrgDiff / destDiff are not in the schema printed above, so
-- they must be added to financials_labeled by an unshown upstream cell -- confirm.
-- ORDER BY added so that LIMIT returns a deterministic, most-significant-first
-- result set (the original query's LIMIT without ORDER BY was nondeterministic).
WITH pair_totals AS (
    SELECT
        nameOrig,
        nameDest,
        label,
        SUM(OrgDiff) AS TotalOrgDiff,
        SUM(destDiff) AS TotalDestDiff
    FROM financials_labeled
    GROUP BY nameOrig, nameDest, label
)
SELECT
    nameOrig,
    nameDest,
    label,
    TotalOrgDiff,
    TotalDestDiff
FROM pair_totals
WHERE TotalDestDiff >= 1000000
ORDER BY TotalDestDiff DESC, nameOrig, nameDest
LIMIT 100
ScaDaMaLe Course site and book