# File listing metadata: 66 lines, 2.3 KiB, Python
|
import random
|
||
|
import pandas as pd
|
||
|
from faker import Faker
|
||
|
|
||
|
# Module-level Faker instance; generate_student_data() draws its IDs, majors
# and location fields from it.
fake = Faker()
|
||
|
|
||
|
def generate_student_data(num_students):
    """Build a DataFrame of randomly generated student enrollment records.

    Every row is an independent draw: identifiers, major and location fields
    come from the module-level ``fake`` Faker instance, while the numeric and
    categorical fields come from the ``random`` module.

    Args:
        num_students: Number of rows to generate. Zero or a negative value
            produces an empty DataFrame that still carries every column, so
            downstream code (e.g. ``to_csv``) keeps a stable schema.

    Returns:
        pandas.DataFrame with one row per student and the columns listed in
        ``columns`` below, in that order.
    """
    # Loop-invariant choice pools, built once instead of on every iteration.
    student_types = (
        'First-time Freshmen',
        'Transfer Student',
        'Graduate Student',
        'Audit Student',
    )
    funnel_stages = (
        'Lead',
        'Prospect',
        'Applied',
        'Admitted',
        'Waitlisted',
        'Denied',
        'Deposited',
        'Enrolled',
    )

    # Explicit schema so an empty result still has its header.
    columns = [
        'Student ID',
        'Student Type',
        'Major',
        'GPA',
        'SAT Score',
        'ACT Score',
        'State',
        'Zip Code',
        'Country',
        'First Generation',
        'Need Rank',
        'EFC',
        'Financial Aid Package',
        'Published Tuition Price',
        'Discount Rate',
        'Funnel Stage',
    ]

    data = []
    for _ in range(num_students):
        # Basic student data. The order of the random/Faker calls is kept
        # stable so that seeding the generators yields the same rows.
        student_id = fake.unique.uuid4()
        student_type = random.choice(student_types)
        major = fake.word().capitalize()
        gpa = round(random.uniform(2.0, 4.0), 2)
        sat_score = random.randint(800, 1600)
        act_score = random.randint(1, 36)
        state = fake.state()
        zip_code = fake.zipcode()
        country = fake.country()
        first_generation = random.choice([True, False])
        need_rank = random.randint(1, 6)
        efc = random.randint(0, 100000)  # Expected Family Contribution
        financial_aid_package = random.randint(1000, 50000)  # Aid package amount

        # Published tuition price, used as the denominator of the discount
        # rate. The rate is a percentage and can exceed 100 because the aid
        # package (up to 50000) may be larger than the tuition (as low as 30000).
        published_tuition_price = random.randint(30000, 70000)
        discount_rate = round((financial_aid_package / published_tuition_price) * 100, 2)

        # Enrollment funnel stage
        funnel_stage = random.choice(funnel_stages)

        data.append({
            'Student ID': student_id,
            'Student Type': student_type,
            'Major': major,
            'GPA': gpa,
            'SAT Score': sat_score,
            'ACT Score': act_score,
            'State': state,
            'Zip Code': zip_code,
            'Country': country,
            'First Generation': first_generation,
            'Need Rank': need_rank,
            'EFC': efc,
            'Financial Aid Package': financial_aid_package,
            'Published Tuition Price': published_tuition_price,
            'Discount Rate': discount_rate,
            'Funnel Stage': funnel_stage,
        })

    return pd.DataFrame(data, columns=columns)
|
||
|
|
||
|
# Size of the synthetic cohort, and the cohort itself
num_students = 1000
student_df = generate_student_data(num_students)

# Persist the cohort as CSV, leaving out the DataFrame index
student_df.to_csv(path_or_buf='synthetic_student_data.csv', index=False)

# Preview the leading rows of the generated data on stdout
print(student_df.head(n=5))
|