"""Generate a synthetic student-enrollment dataset and save it as a CSV file."""

import random

import pandas as pd
from faker import Faker

fake = Faker()

# Closed sets that the categorical fields are sampled from. Defined once at
# module level so the lists are not rebuilt on every loop iteration.
STUDENT_TYPES = [
    'First-time Freshmen',
    'Transfer Student',
    'Graduate Student',
    'Audit Student',
]
FUNNEL_STAGES = [
    'Lead',
    'Prospect',
    'Applied',
    'Admitted',
    'Waitlisted',
    'Denied',
    'Deposited',
    'Enrolled',
]

# Output schema, in column order. Passed explicitly to the DataFrame
# constructor so that an empty result still carries every column.
COLUMNS = [
    'Student ID',
    'Student Type',
    'Major',
    'GPA',
    'SAT Score',
    'ACT Score',
    'State',
    'Zip Code',
    'Country',
    'First Generation',
    'Need Rank',
    'EFC',
    'Financial Aid Package',
    'Published Tuition Price',
    'Discount Rate',
    'Funnel Stage',
]


def generate_student_data(num_students: int) -> pd.DataFrame:
    """Build a DataFrame of randomly generated student records.

    Every field is drawn independently; no correlation between fields
    (e.g. between GPA and test scores) is modelled.

    Args:
        num_students: Number of rows to generate. Zero or a negative value
            yields an empty frame that still has all columns in ``COLUMNS``.

    Returns:
        A DataFrame with one row per student and the columns listed in
        ``COLUMNS``, in that order.
    """
    data = []
    for _ in range(num_students):
        # Generate basic student data
        student_id = fake.unique.uuid4()
        student_type = random.choice(STUDENT_TYPES)
        # A random capitalized word, not a real academic major.
        major = fake.word().capitalize()
        gpa = round(random.uniform(2.0, 4.0), 2)
        sat_score = random.randint(800, 1600)
        act_score = random.randint(1, 36)
        state = fake.state()
        zip_code = fake.zipcode()
        country = fake.country()
        first_generation = random.choice([True, False])
        need_rank = random.randint(1, 6)
        efc = random.randint(0, 100000)  # Expected Family Contribution
        financial_aid_package = random.randint(1000, 50000)

        # Published tuition price, used to compute the discount rate.
        published_tuition_price = random.randint(30000, 70000)
        # Aid as a percentage of tuition.
        # NOTE(review): aid (up to 50000) is drawn independently of tuition
        # (as low as 30000), so this can exceed 100 (max 166.67) -- confirm
        # that is acceptable for downstream consumers.
        discount_rate = round(
            (financial_aid_package / published_tuition_price) * 100, 2
        )

        # Enrollment funnel stage
        funnel_stage = random.choice(FUNNEL_STAGES)

        data.append({
            'Student ID': student_id,
            'Student Type': student_type,
            'Major': major,
            'GPA': gpa,
            'SAT Score': sat_score,
            'ACT Score': act_score,
            'State': state,
            'Zip Code': zip_code,
            'Country': country,
            'First Generation': first_generation,
            'Need Rank': need_rank,
            'EFC': efc,
            'Financial Aid Package': financial_aid_package,
            'Published Tuition Price': published_tuition_price,
            'Discount Rate': discount_rate,
            'Funnel Stage': funnel_stage,
        })

    # Explicit columns keep the schema stable even when no rows were generated.
    return pd.DataFrame(data, columns=COLUMNS)


# Generate synthetic data for 1000 students
num_students = 1000
student_df = generate_student_data(num_students)

# Save the data to a CSV file
student_df.to_csv('synthetic_student_data.csv', index=False)

# Display the first few rows of the generated data
print(student_df.head())