import random
import pandas as pd
from faker import Faker

fake = Faker()

# Categorical value pools; one entry is picked uniformly at random per student.
_STUDENT_TYPES = (
    'First-time Freshmen',
    'Transfer Student',
    'Graduate Student',
    'Audit Student',
)
_FUNNEL_STAGES = (
    'Lead',
    'Prospect',
    'Applied',
    'Admitted',
    'Waitlisted',
    'Denied',
    'Deposited',
    'Enrolled',
)

# Output column order. It is handed to the DataFrame constructor so that a
# zero-row result still carries the full schema (and a CSV header).
_STUDENT_COLUMNS = (
    'Student ID',
    'Student Type',
    'Major',
    'GPA',
    'SAT Score',
    'ACT Score',
    'State',
    'Zip Code',
    'Country',
    'First Generation',
    'Need Rank',
    'EFC',
    'Financial Aid Package',
    'Published Tuition Price',
    'Discount Rate',
    'Funnel Stage',
)


def generate_student_data(num_students: int) -> pd.DataFrame:
    """Build a DataFrame of randomly generated student records.

    Each row holds an identifier, demographic fields produced by the
    module-level ``fake`` Faker instance, and academic / financial figures
    drawn from the ``random`` module:

    * GPA: uniform in [2.0, 4.0], rounded to 2 decimals.
    * SAT Score: integer in [800, 1600]; ACT Score: integer in [1, 36].
    * Need Rank: integer in [1, 6]; EFC: integer in [0, 100000].
    * Financial Aid Package: integer in [1000, 50000].
    * Published Tuition Price: integer in [30000, 70000].
    * Discount Rate: aid / tuition as a percentage, rounded to 2 decimals.

    Args:
        num_students: Number of rows to generate. Zero or a negative value
            yields an empty frame that still has every column.

    Returns:
        A DataFrame with one row per student and the columns listed in
        ``_STUDENT_COLUMNS``, in that order.

    Raises:
        TypeError: If ``num_students`` is not accepted by ``range`` (for
            example a float).
    """
    records = []

    for _ in range(num_students):
        # Identity and academic profile.
        student_id = fake.unique.uuid4()
        student_type = random.choice(_STUDENT_TYPES)
        major = fake.word().capitalize()
        gpa = round(random.uniform(2.0, 4.0), 2)
        sat_score = random.randint(800, 1600)
        act_score = random.randint(1, 36)

        # Geography.
        state = fake.state()
        zip_code = fake.zipcode()
        country = fake.country()

        # Financial need.
        first_generation = random.choice([True, False])
        need_rank = random.randint(1, 6)
        efc = random.randint(0, 100000)  # Expected Family Contribution
        financial_aid_package = random.randint(1000, 50000)

        # Published tuition price, used as the denominator of the discount rate.
        # NOTE(review): aid (up to 50000) is drawn independently of tuition
        # (as low as 30000), so the rate can exceed 100 -- confirm this is
        # acceptable for downstream consumers.
        published_tuition_price = random.randint(30000, 70000)
        discount_rate = round(
            (financial_aid_package / published_tuition_price) * 100, 2
        )

        # Enrollment funnel stage.
        funnel_stage = random.choice(_FUNNEL_STAGES)

        records.append({
            'Student ID': student_id,
            'Student Type': student_type,
            'Major': major,
            'GPA': gpa,
            'SAT Score': sat_score,
            'ACT Score': act_score,
            'State': state,
            'Zip Code': zip_code,
            'Country': country,
            'First Generation': first_generation,
            'Need Rank': need_rank,
            'EFC': efc,
            'Financial Aid Package': financial_aid_package,
            'Published Tuition Price': published_tuition_price,
            'Discount Rate': discount_rate,
            'Funnel Stage': funnel_stage,
        })

    # Explicit columns keep the schema stable even when no rows were produced.
    return pd.DataFrame(records, columns=list(_STUDENT_COLUMNS))

# Size of the synthetic cohort to produce.
num_students = 1000

# Build the cohort as a DataFrame, one row per student.
student_df = generate_student_data(num_students=num_students)

# Persist the cohort as CSV, leaving out the DataFrame index column.
student_df.to_csv(path_or_buf='synthetic_student_data.csv', index=False)

# Echo the first five rows as a quick sanity check.
print(student_df.head(n=5))