1.Agent 架构配置
2.测试数据库
采用开源的employees sample
3.资料库配置
ddl 从mysql里导出的
CREATE TABLE `departments` (
`dept_no` char(4) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
`dept_name` varchar(40) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
PRIMARY KEY (`dept_no`) USING BTREE,
UNIQUE INDEX `dept_name`(`dept_name` ASC) USING BTREE
)
CREATE TABLE `dept_emp` (
`emp_no` int NOT NULL,
`dept_no` char(4) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
`from_date` date NOT NULL,
`to_date` date NOT NULL,
PRIMARY KEY (`emp_no`, `dept_no`) USING BTREE,
INDEX `dept_no`(`dept_no` ASC) USING BTREE,
CONSTRAINT `dept_emp_ibfk_1` FOREIGN KEY (`emp_no`) REFERENCES `employees` (`emp_no`) ON DELETE CASCADE ON UPDATE RESTRICT,
CONSTRAINT `dept_emp_ibfk_2` FOREIGN KEY (`dept_no`) REFERENCES `departments` (`dept_no`) ON DELETE CASCADE ON UPDATE RESTRICT
)
CREATE TABLE `dept_manager` (
`emp_no` int NOT NULL,
`dept_no` char(4) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
`from_date` date NOT NULL,
`to_date` date NOT NULL,
PRIMARY KEY (`emp_no`, `dept_no`) USING BTREE,
INDEX `dept_no`(`dept_no` ASC) USING BTREE,
CONSTRAINT `dept_manager_ibfk_1` FOREIGN KEY (`emp_no`) REFERENCES `employees` (`emp_no`) ON DELETE CASCADE ON UPDATE RESTRICT,
CONSTRAINT `dept_manager_ibfk_2` FOREIGN KEY (`dept_no`) REFERENCES `departments` (`dept_no`) ON DELETE CASCADE ON UPDATE RESTRICT
)
CREATE TABLE `employees` (
`emp_no` int NOT NULL,
`birth_date` date NOT NULL,
`first_name` varchar(14) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
`last_name` varchar(16) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
`gender` enum('M','F') CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
`hire_date` date NOT NULL,
PRIMARY KEY (`emp_no`) USING BTREE
)
CREATE TABLE `salaries` (
`emp_no` int NOT NULL,
`salary` int NOT NULL,
`from_date` date NOT NULL,
`to_date` date NOT NULL,
PRIMARY KEY (`emp_no`, `from_date`) USING BTREE,
CONSTRAINT `salaries_ibfk_1` FOREIGN KEY (`emp_no`) REFERENCES `employees` (`emp_no`) ON DELETE CASCADE ON UPDATE RESTRICT
)
CREATE TABLE `titles` (
`emp_no` int NOT NULL,
`title` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
`from_date` date NOT NULL,
`to_date` date NULL DEFAULT NULL,
PRIMARY KEY (`emp_no`, `title`, `from_date`) USING BTREE,
CONSTRAINT `titles_ibfk_1` FOREIGN KEY (`emp_no`) REFERENCES `employees` (`emp_no`) ON DELETE CASCADE ON UPDATE RESTRICT
)
db_description 这个是让大模型自己根据ddl生成的
db_description: The database consists of six tables: departments, dept_emp, dept_manager, employees, salaries, and titles.
departments: Stores department information, including a unique department ID (dept_no) and department name (dept_name).
dept_emp: Tracks employee assignments to departments, linking employees (emp_no) to departments (dept_no) with start (from_date) and end dates (to_date).
dept_manager: Records manager assignments for departments, linking managers (emp_no) to departments (dept_no) with start (from_date) and end dates (to_date).
employees: Contains employee details, including employee ID (emp_no), birth date, first name, last name, gender, and hire date.
salaries: Stores salary information for employees, linking salaries to employees (emp_no) with start (from_date) and end dates (to_date).
titles: Tracks employee job titles over time, linking titles to employees (emp_no) with start (from_date) and end dates (to_date).
qa_file 随便写的几个简单sql
| Get all employees' employee id and birth date, name, gender,hire date | SELECT emp_no,birth_date,first_name,last_name,gender,hire_date FROM employees; |
|---|---|
| Get all departments' department no and name | SELECT dept_no, dept_name FROM departments |
| Get employees' department | SELECT d., e. FROM departments d, employees e, dept_emp de where e.emp_no = de.emp_no AND d.dept_no = de.dept_no ORDER BY e.emp_no ASC |
| Get employees' title | SELECT t., e. FROM titles t, employees e where e.emp_no = t.emp_no |
| Get employee no 10001's manager | SELECT * from (SELECT dm.* from dept_manager dm , (SELECT d.dept_no from departments d, employees e, dept_emp de where d.dept_no = de.dept_no and de.emp_no = e.emp_no and e.emp_no = "10001") res where res.dept_no = dm.dept_no) res1, employees e WHERE res1.emp_no = e.emp_no |
| Get employees' salary | SELECT * from salaries s, employees e WHERE s.emp_no = e.emp_no |
4.测试case
| case | 执行情况 | 测试结果 |
|---|---|---|
| 简单的问题 | ||
| 公司员工有几人 | - 符合预期 | |
| sql SELECT COUNT(*) FROM employees; | ||
| 部门有几个 | - 符合预期 | |
| sql SELECT COUNT(*) FROM departments; | ||
| 部门领导有几个 | - 符合预期 | |
| sql SELECT COUNT(DISTINCT emp_no) AS manager_count FROM dept_manager; | ||
| 目前薪酬最高员工是谁 | - X符合预期结果虽然正确 但是sql 有问题会有bug,因为salary记录各个时间区间的工资,问题需要的是最新的工资,应该要加条件to_date = '9999-01-01' | |
| sql SELECT e.emp_no, e.first_name, e.last_name, s.salary FROM employees e JOIN salaries s ON e.emp_no = s.emp_no ORDER BY s.salary DESC LIMIT 1; | ||
| 尝试将正确的sql更新到QA里,重新尝试 | ||
| sql SELECT e.emp_no, e.first_name, e.last_name, s.salary FROM employees e JOIN salaries s ON e.emp_no = s.emp_no WHERE s.to_date = '9999-01-01' ORDER BY s.salary DESC LIMIT 1; | - 符合预期 | |
| 目前薪酬第二高员工是谁 | - 符合预期 | |
| sql SELECT e.emp_no, e.first_name, e.last_name, s.salary FROM employees e JOIN salaries s ON e.emp_no = s.emp_no WHERE s.to_date = '9999-01-01' ORDER BY s.salary DESC LIMIT 1 OFFSET 1; | ||
| 目前总薪酬最高的部门 | - 符合预期 | |
| sql SELECT d.dept_no, d.dept_name, SUM(s.salary) AS total_salary FROM departments d JOIN dept_emp de ON d.dept_no = de.dept_no JOIN salaries s ON de.emp_no = s.emp_no WHERE s.to_date = '9999-01-01' AND de.to_date = '9999-01-01' GROUP BY d.dept_no, d.dept_name ORDER BY total_salary DESC LIMIT 1; | ||
| 目前薪酬最高的领导 | - 符合预期 | |
| sql SELECT e.emp_no, e.first_name, e.last_name, s.salary FROM employees e JOIN salaries s ON e.emp_no = s.emp_no JOIN dept_manager dm ON e.emp_no = dm.emp_no WHERE s.to_date = '9999-01-01' AND dm.to_date = '9999-01-01' ORDER BY s.salary DESC LIMIT 1; | ||
| 复杂的问题 | ||
| 以年龄十年为区间,分析一下各区间段员工的目前收入情况 | - 符合预期 | |
| `sql SELECT FLOOR(DATEDIFF(CURDATE(), e.birth_date) / 365 / 10) * 10 AS age_range, AVG(s.salary) AS average_salary FROM employees e JOIN salaries s ON e.emp_no = s.emp_no WHERE s.to_date = '9999-01-01' GROUP BY age_range ORDER BY age_range; | ||
| 以部门维度,分析各部门的员工目前收入情况 | - 符合预期?sql执行结果没问题,分析估计字数超了,内网不限制token数量应该没问题 | |
| sql SELECT d.dept_no, d.dept_name, AVG(s.salary) AS avg_salary, MAX(s.salary) AS max_salary, MIN(s.salary) AS min_salary FROM departments d JOIN dept_emp de ON d.dept_no = de.dept_no JOIN salaries s ON de.emp_no = s.emp_no WHERE s.to_date = '9999-01-01' AND de.to_date = '9999-01-01' GROUP BY d.dept_no, d.dept_name; | ||
| 详细分析一下质量管理部的员工收入,性别,职务和年龄情况 | - X符合预期sql 没有问题但是由于数据量过于庞大,貌似就接收到30条数据? | |
| sql SELECT e.emp_no, e.first_name, e.last_name, e.gender, YEAR(CURDATE()) - YEAR(e.birth_date) AS age, t.title, s.salary FROM employees e JOIN dept_emp de ON e.emp_no = de.emp_no JOIN departments d ON de.dept_no = d.dept_no JOIN titles t ON e.emp_no = t.emp_no AND t.to_date = '9999-01-01' JOIN salaries s ON e.emp_no = s.emp_no AND s.to_date = '9999-01-01' WHERE d.dept_name = 'Quality Management'; | ||
| 尝试将sql执行结果导出,上传资料库去rag,尝试可行性。 | ||
| 不太行,rag原理是从资料库中抽取相关联的信息提供到模型prompt,这种需要全局大数据的资料分析问题解决不了的。 |
5.结论
ragflow生成sql质量总体还可以,如果要让模型自己去执行查询和处理后续数据的话,只适合处理简单的问题,数据量吞吐小的问题。生成的sql偶尔会有bug,需要收集之后,补充到资料库中之后,可以解决。