# university-data/discount-rate.py
#
# 66 lines
# 2.3 KiB
# Python
# Raw Permalink Normal View History
#
# 2024-06-10 18:29:30 -04:00
import random
import pandas as pd
from faker import Faker
fake = Faker()
def generate_student_data(num_students):
    """Build a DataFrame of randomly generated student enrollment records.

    Every record mixes values drawn from the ``random`` module with values
    produced by the module-level ``fake`` Faker instance. 'Discount Rate' is
    the financial aid package expressed as a percentage of the published
    tuition price, rounded to two decimals.

    Args:
        num_students: Number of records to generate. Zero or a negative
            count produces an empty DataFrame that still carries every
            column.

    Returns:
        A pandas DataFrame with one row per generated student and the
        columns listed in ``columns`` below, in that order.
    """
    # Fixed column order. Passing it to the DataFrame constructor keeps the
    # schema intact even when no rows are generated (an empty list of dicts
    # alone would give a DataFrame without any columns).
    columns = [
        'Student ID',
        'Student Type',
        'Major',
        'GPA',
        'SAT Score',
        'ACT Score',
        'State',
        'Zip Code',
        'Country',
        'First Generation',
        'Need Rank',
        'EFC',
        'Financial Aid Package',
        'Published Tuition Price',
        'Discount Rate',
        'Funnel Stage',
    ]

    # Loop-invariant choice pools, built once instead of per student.
    student_types = [
        'First-time Freshmen',
        'Transfer Student',
        'Graduate Student',
        'Audit Student',
    ]
    funnel_stages = [
        'Lead',
        'Prospect',
        'Applied',
        'Admitted',
        'Waitlisted',
        'Denied',
        'Deposited',
        'Enrolled',
    ]

    data = []
    for _ in range(num_students):
        # Generate basic student data. The order of the draws is kept
        # stable so a caller that seeds ``random`` gets the same sequence.
        student_id = fake.unique.uuid4()
        student_type = random.choice(student_types)
        major = fake.word().capitalize()
        gpa = round(random.uniform(2.0, 4.0), 2)
        sat_score = random.randint(800, 1600)
        act_score = random.randint(1, 36)
        state = fake.state()
        zip_code = fake.zipcode()
        country = fake.country()
        first_generation = random.choice([True, False])
        need_rank = random.randint(1, 6)
        efc = random.randint(0, 100000)  # Expected Family Contribution
        financial_aid_package = random.randint(1000, 50000)

        # Published tuition price for calculation of discount rate.
        published_tuition_price = random.randint(30000, 70000)
        # NOTE(review): aid ranges up to 50000 while tuition can be as low
        # as 30000, so this percentage can exceed 100 - confirm whether
        # that is acceptable for consumers of the data.
        discount_rate = round(
            (financial_aid_package / published_tuition_price) * 100, 2
        )

        # Enrollment funnel stage.
        funnel_stage = random.choice(funnel_stages)

        data.append({
            'Student ID': student_id,
            'Student Type': student_type,
            'Major': major,
            'GPA': gpa,
            'SAT Score': sat_score,
            'ACT Score': act_score,
            'State': state,
            'Zip Code': zip_code,
            'Country': country,
            'First Generation': first_generation,
            'Need Rank': need_rank,
            'EFC': efc,
            'Financial Aid Package': financial_aid_package,
            'Published Tuition Price': published_tuition_price,
            'Discount Rate': discount_rate,
            'Funnel Stage': funnel_stage,
        })

    return pd.DataFrame(data, columns=columns)
# Size of the synthetic cohort produced by this script.
num_students = 1_000

# Build the table of generated student records.
student_df = generate_student_data(num_students)

# Persist the records as CSV, leaving out the DataFrame index column.
student_df.to_csv(
    'synthetic_student_data.csv',
    index=False,
)

# Print a short preview (the first five rows) of what was generated.
print(student_df.head(5))