This tool leverages OpenAI's GPT to generate and optimize SQL queries from natural language descriptions. Key features:
✅ Natural Language to SQL: Convert questions like "Show all employees in IT earning over $80k" into valid SQL.
✅ Self-Contained Jupyter Notebook: Install dependencies, configure API keys, and run queries in one place.
✅ Query Execution & Optimization: Execute generated SQL and receive GPT-suggested optimizations.
git clone https://github.com/AmmarAhmedl200961/automated-sql-generation-openai.git
Configure OpenAI API Key: Add your key to api.txt.
Run the Jupyter Notebook: Execute cells to generate SQL from natural language inputs.
Libraries setup
! pip install -q pip install openai prettytable python-dotenv pdfplumber
Api setup OpenAi
import openai with open ("api.txt") as f: openai.api_key = f.read().strip()
openai.models.list()
%%writefile input.txt Show me the top 5 highest-paid employees. Who is the youngest employee in the Engineering department? Find all employees who were hired in the year 2019. Show the names of employees whose salary is above the average salary in their department. Display the names of employees whose first and last names start with the same letter. Find the employee with the highest salary in each department
I have connected the program to a real database and
executed the generated SQL queries (using sqlite3) thus the program can fetch and display results
from the database as well
Table setup using sqlite3(completes extra credit II) and pdfplumber
import pdfplumber def extract_table_from_pdf(file_path): with pdfplumber.open(file_path) as pdf: last_page = pdf.pages[-1] # the table is on the last page table = last_page.extract_table() return table file_path = 'xxx-SQL.pdf' table_data = extract_table_from_pdf(file_path) for row in table_data: print(row)
If we provided complex multi-table queries or queries with
subqueries, my code can handle these effectively.
the table generated can be seen above, if more tables are needed they can be added in the sqlite3 db "employees.db" or otherwise
Creating sqlite3 db with many tables
import sqlite3 conn = ... cursor = conn.cursor() query = """ SELECT e.FirstName, e.LastName, d.DepartmentName FROM Employees e JOIN Departments d ON e.DepartmentID = d.DepartmentID WHERE e.Salary > ( SELECT AVG(Salary) FROM Employees WHERE DepartmentID = e.DepartmentID ) """ cursor = conn.execute(query) # display results...
in short only the reulting SCHEMA and query statements need to change, the LLM can handle generating sql and optimizing the query
undefined
import sqlite3 db_name = 'employees.db' def create_and_populate_db(db_name, data): # Connect to SQLite database conn = sqlite3.connect(db_name) cursor = conn.cursor() # Create table cursor.execute(''' CREATE TABLE IF NOT EXISTS employees ( EmployeeID INTEGER PRIMARY KEY, FirstName TEXT, LastName TEXT, Age INTEGER, Department TEXT, Position TEXT, Salary INTEGER, HireDate TEXT, ManagerID INTEGER ) ''') # skip the header row of the table prepared_data = [tuple(row) for row in data[1:]] # Insert data into table cursor.executemany('INSERT INTO employees VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', prepared_data) conn.commit() conn.close() create_and_populate_db(db_name, table_data) # show table conn = sqlite3.connect(db_name) cursor = conn.cursor() cursor.execute('SELECT * FROM employees') rows = cursor.fetchall() for row in rows: print(row)
Using LLM to generate SQLs
from openai import OpenAI from prettytable import PrettyTable import sqlite3 class DB: def __init__(self, DB_NAME): self.conn = sqlite3.connect(DB_NAME) self.cursor = self.conn.cursor() def execute(self, query): results = [] for statement in query.split(';'): if statement.strip(): self.cursor.execute(statement.strip()) results.extend(self.cursor.fetchall()) return results def terminate(self): self.conn.close() class OpenAIClient: def __init__(self, api_key): self.client = OpenAI(api_key=api_key) def generate_sql(self, question, schema): prompt=f""" f"Given the following SQL schema: {schema} Write a SQL query to {question}" """ response = self.client.completions.create( model="gpt-3.5-turbo-instruct", prompt=prompt, temperature=0, max_tokens=256, top_p=1, frequency_penalty=0, presence_penalty=0 ) sql_query = response.choices[0].text.strip() return sql_query def optimize_sql(self, sql_query): prompt=f""" Optimize this SQL query using using indexes, optimizing joins or other techniques. Only give the resulting sql, for the question: {sql_query} """ response = self.client.completions.create( model="gpt-3.5-turbo-instruct", prompt=prompt, max_tokens=256, top_p=1, frequency_penalty=0, presence_penalty=0 ) return response.choices[0].text.strip() class QuestionProcessor: def __init__(self, file_path): self.file_path = file_path def read_questions(self): with open(self.file_path, 'r') as file: questions = file.readlines() return [q.strip() for q in questions] class ResultDisplay: @staticmethod def display_results(cursor, question, sql_query, query_result, optimization): table = PrettyTable() table.field_names = ["Aspect", "Detail"] table.add_row(["Question", question]) table.add_row(["SQL Query", sql_query]) table.add_row(["Optimization Suggestion", optimization]) result_table = PrettyTable() result_table.field_names = [description[0] for description in cursor.description] for row in query_result: result_table.add_row(row) print(table) print("Query Results:") print(result_table) class Application: SCHEMA = """ CREATE TABLE employees ( EmployeeID INTEGER PRIMARY KEY, FirstName TEXT, LastName TEXT, Age INTEGER, Department TEXT, Position TEXT, Salary INTEGER, HireDate TEXT, ManagerID INTEGER ); """ def __init__(self, db_name, api_key, question_file): self.db_manager = DB(db_name) self.openai_client = OpenAIClient(api_key) self.question_processor = QuestionProcessor(question_file) def run(self): questions = self.question_processor.read_questions() for question in questions: sql_query = self.openai_client.generate_sql(question, self.SCHEMA) optimization = self.openai_client.optimize_sql(sql_query) query_result = self.db_manager.execute(sql_query) ResultDisplay.display_results(self.db_manager.cursor, question, sql_query, query_result, optimization) def close(self): self.db_manager.terminate() if __name__ == "__main__": with open("api.txt") as f: api_key = f.read().strip() app = Application(db_name, api_key, 'input.txt') try: app.run() finally: app.close()
4th question output seemed wrong i have tried to fix it with user input
def generate_sql(question): schema = Application.SCHEMA prompt = """ Given the following SQL schema: {schema} Example: To find the names of all employees in the 'Sales' department whose salary is above the department average, you would use: SELECT FirstName, LastName FROM employees WHERE Department = 'Sales' AND Salary > (SELECT AVG(Salary) FROM employees WHERE Department = 'Sales'); Task: Now, write a query based on the same pattern for the Engineering department: {question} """ client = OpenAI(api_key=api_key) response = client.completions.create( model="gpt-3.5-turbo-instruct", prompt=prompt, temperature=0.2, # Lower temperature to ensure more deterministic outputs max_tokens=256, top_p=1, frequency_penalty=0, presence_penalty=0 ) sql_query = response.choices[0].text.strip() return sql_query def execute_sql(sql_query): # Splitting the query if needed and executing results = [] for statement in sql_query.split(';'): if statement.strip(): cursor.execute(statement.strip()) results.extend(cursor.fetchall()) return results # Example question for the task db = DB(db_name) cursor = db.cursor question = "Find the names of all employees in the 'Engineering' department whose salary is above the department average." db = DB('employees.db') cursor = db.cursor sql_query = generate_sql(question) results = db.execute(sql_query) optimization = OpenAIClient(api_key).optimize_sql(sql_query) ResultDisplay.display_results(cursor, question, sql_query, results, optimization) db.terminate()
Examining distribution of salary in engineering department (why do we get no results above)
# execute sql "SELECT AVG(Salary) AS AverageSalary, MIN(Salary) AS MinSalary, MAX(Salary) AS MaxSalary # FROM employees # WHERE Department = 'Engineering'; # " sql_query = "SELECT AVG(Salary) AS AverageSalary, MIN(Salary) AS MinSalary, MAX(Salary) AS MaxSalary FROM employees WHERE Department = 'Engineering';" results = DB('employees.db').execute(sql_query) print(results)
So this means this distribution for engineering department actually does not have employees with salaries above the average and thus returned no results.
5th question was solved wrongly here i have tried to fix it with user input
def generate_sql(question): schema = Application.SCHEMA prompt = f""" Given the following SQL schema: {schema} Example: To find the single employee with the highest salary in each department, you would use: SELECT Department, FirstName, LastName, Salary FROM ( SELECT *, ROW_NUMBER() OVER (PARTITION BY Department ORDER BY Salary DESC) AS rn FROM employees ) t WHERE rn = 1; Task: Now, write a query for the following request: {question} """ client = OpenAI(api_key=api_key) response = client.completions.create( model="gpt-3.5-turbo-instruct", prompt=prompt, temperature=0, max_tokens=150, top_p=1, frequency_penalty=0, presence_penalty=0 ) # skip first line as with this prompt we don't need the example query return response.choices[0].text.strip() # Example question for the task question = "Find the employee with the highest salary in each department" db = DB('employees.db') cursor = db.cursor sql_query = generate_sql(question) results = db.execute(sql_query) optimization = OpenAIClient(api_key).optimize_sql(sql_query) ResultDisplay.display_results(cursor, question, sql_query, results, optimization) db.terminate()
All good !