From 1c1792e204861c2e887f89f02309fdf44167488c Mon Sep 17 00:00:00 2001 From: zk Date: Thu, 19 Mar 2026 10:55:30 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=B2=97=E4=BD=8D=E6=A0=87?= =?UTF-8?q?=E7=AD=BE=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .kiro/steering/数据清洗方案.md | 47 ++++++++++- .kiro/steering/项目结构说明.md | 3 + .../java/org/jiayunet/ai/AiChatAbility.java | 1 - .../mapper/JobSkillTagRelationMapper.java | 13 +++ .../jiayunet/pojo/po/JobSkillTagRelation.java | 31 +++++++ .../jiayunet/service/DictCacheService.java | 55 ++++++++++-- .../org/jiayunet/service/JobCleanService.java | 84 ++++++++++++++++++- .../service/JobCleanTransactionService.java | 41 +++++++-- 8 files changed, 253 insertions(+), 22 deletions(-) create mode 100644 manager/src/main/java/org/jiayunet/mapper/JobSkillTagRelationMapper.java create mode 100644 manager/src/main/java/org/jiayunet/pojo/po/JobSkillTagRelation.java diff --git a/.kiro/steering/数据清洗方案.md b/.kiro/steering/数据清洗方案.md index 199bac9..8e6fdf4 100644 --- a/.kiro/steering/数据清洗方案.md +++ b/.kiro/steering/数据清洗方案.md @@ -94,10 +94,11 @@ UPDATE app_job_data SET clean_status=0 WHERE clean_status=1 AND updated_at < NOW ### 2.2 参考数据准备 应用启动时加载并缓存(定期刷新): -- `bg_job_category` 全量:拼成 `id:name` 文本列表 -- `bg_industry` 全量:拼成 `id:name` 文本列表 +- `bg_job_category` 全量:只取叶子节点(level=3),拼成 `id:name(一级/二级)` 文本列表 +- `bg_industry` 全量:只取叶子节点(level=2),拼成 `id:name(一级)` 文本列表 +- `bg_skill_tag` 全量:按 `categoryId` 分组缓存为 `Map<Long, List<SkillTag>>`,供第二次 AI 调用使用 -这两份列表作为 prompt 的一部分传给AI,ID由人工维护为短数字,不使用雪花ID。 +分类和行业列表作为 prompt 的一部分传给AI,ID由人工维护为短数字,不使用雪花ID。 地区数据(`bg_china_regions_code`)不传给AI,由Java侧根据AI返回的城市名自行匹配。 @@ -162,7 +163,41 @@ UPDATE app_job_data SET clean_status=0 WHERE clean_status=1 AND updated_at < NOW 步骤 2-7 放在一个短事务中,保证数据一致性。 -### 2.5 设计决策记录 +### 2.5 技能标签匹配(第二次 AI 调用) + +`bg_skill_tag` 是预定义的技能标签池,挂在岗位类型下,用于岗位-简历匹配度计算。与 `bg_job.skill_tags`(自由文本,展示用)是两套东西。 + +#### 关联表 + +```sql +CREATE TABLE 
bg_job_skill_tag_relation ( + id BIGINT NOT NULL, + job_id BIGINT NOT NULL COMMENT '岗位ID', + skill_tag_id BIGINT NOT NULL COMMENT '技能标签ID', + create_time DATETIME NOT NULL COMMENT '创建时间', + PRIMARY KEY (id), + INDEX idx_job_id (job_id), + INDEX idx_skill_tag_id (skill_tag_id), + UNIQUE INDEX uk_job_skill (job_id, skill_tag_id) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='岗位-技能标签关联表'; +``` + +#### 流程 + +1. 第一次 AI 调用完成后,拿到 `categoryId` +2. 从缓存取该 `categoryId` 下的 skillTag 列表,为空则跳过 +3. 发起第二次 AI 调用:传岗位标题+职责+要求 + 该分类下的标签列表(`id:name`),AI 返回匹配的标签 ID 数组 +4. 校验返回的 ID 确实存在于该 categoryId 的标签池中(防幻觉) +5. 写入 `bg_job_skill_tag_relation`(岗位入库后,由独立事务 saveSkillTagRelations 批量写入,不在 saveJobData 事务内) +6. 第二次 AI 调用失败不影响岗位入库,仅日志记录 + +#### prompt 规则 + +- 只能从给定列表中选择,不允许自创 +- 不限制个数,不重复即可 +- 只返回 ID 数组,如 `[1, 3, 7]` + +### 2.6 设计决策记录 | 决策点 | 结论 | 原因 | |--------|------|------| @@ -175,6 +210,10 @@ UPDATE app_job_data SET clean_status=0 WHERE clean_status=1 AND updated_at < NOW | skillTags 数量 | 最多8个 | 控制数量,保持精炼 | | source_id 取值 | app_job_data.id | 简单直接,用于去重 | | 公司不存在时 | 自动创建 status=0 待完善 | 后续由公司数据补充逻辑完善 | +| skillTag 存储方式 | 独立关联表 bg_job_skill_tag_relation | 百万级数据量,JSON查询性能差 | +| skillTag 匹配时机 | 第二次 AI 调用,岗位入库后 | 需要先确定 categoryId 才能缩小标签范围 | +| skillTag 与 skill_tags 的关系 | 两套独立数据 | skill_tags 是展示用自由文本,skillTag 是计算用预定义标签 | +| 第二次 AI 失败是否影响入库 | 不影响 | 标签匹配是增强功能,不阻断主流程 | --- diff --git a/.kiro/steering/项目结构说明.md b/.kiro/steering/项目结构说明.md index 2d2a5cd..3890cd7 100644 --- a/.kiro/steering/项目结构说明.md +++ b/.kiro/steering/项目结构说明.md @@ -70,6 +70,7 @@ offerpie/back-end │ ├─ CompanyMapper.java # 公司Mapper │ ├─ JobMapper.java # 岗位Mapper │ ├─ JobRegionRelationMapper.java # 岗位-地区关联Mapper + │ ├─ JobSkillTagRelationMapper.java # 岗位-技能标签关联Mapper │ ├─ IndustryMapper.java # 行业Mapper │ ├─ SkillTagMapper.java # 技能标签Mapper │ ├─ UserJobFavoriteMapper.java # 用户收藏岗位Mapper @@ -91,6 +92,7 @@ offerpie/back-end │ │ ├─ Company.java # 公司表(bg_company) │ │ ├─ Job.java # 岗位表(bg_job) │ │ ├─ JobRegionRelation.java # 岗位-地区关联表(bg_job_region_relation) + │ │ ├─ 
JobSkillTagRelation.java # 岗位-技能标签关联表(bg_job_skill_tag_relation) │ │ ├─ Industry.java # 行业字典表(bg_industry) │ │ ├─ SkillTag.java # 技能标签表(bg_skill_tag) │ │ ├─ UserJobFavorite.java # 用户收藏岗位表(bg_user_job_favorite) @@ -133,6 +135,7 @@ offerpie/back-end | `Industry` | manager | 行业字典表(bg_industry),树形结构,一级/二级分类。 | | `SkillTag` | manager | 技能标签表(bg_skill_tag),挂在岗位类型下,不分级,用于匹配度计算。 | | `UserJobDislike` | manager | 用户不感兴趣记录表(bg_user_job_dislike),记录用户对岗位的不感兴趣原因,冗余公司ID/地区编码/行业ID方便推荐过滤。 | +| `JobSkillTagRelation` | manager | 岗位-技能标签关联表(bg_job_skill_tag_relation),预定义技能标签与岗位的关联,用于匹配度计算。 | | `AppJobData` | manager | 爬虫岗位原始数据表(app_job_data),存储爬虫抓取的原始岗位数据,供清洗服务读取并写入业务表。 | ## 4️⃣ 权限体系设计 diff --git a/common/src/main/java/org/jiayunet/ai/AiChatAbility.java b/common/src/main/java/org/jiayunet/ai/AiChatAbility.java index 76b1414..d91c42b 100644 --- a/common/src/main/java/org/jiayunet/ai/AiChatAbility.java +++ b/common/src/main/java/org/jiayunet/ai/AiChatAbility.java @@ -49,7 +49,6 @@ public class AiChatAbility { } String url = config.getBaseUrl() + "/chat/completions"; - log.info("AI 请求 URL: {}, model: {}", url, config.getModel()); Map body = new HashMap<>(); body.put("model", config.getModel()); diff --git a/manager/src/main/java/org/jiayunet/mapper/JobSkillTagRelationMapper.java b/manager/src/main/java/org/jiayunet/mapper/JobSkillTagRelationMapper.java new file mode 100644 index 0000000..e3e3b25 --- /dev/null +++ b/manager/src/main/java/org/jiayunet/mapper/JobSkillTagRelationMapper.java @@ -0,0 +1,13 @@ +package org.jiayunet.mapper; + +import org.apache.ibatis.annotations.Mapper; +import org.jiayunet.pojo.po.JobSkillTagRelation; + +/** + * 岗位-技能标签关联Mapper + * + * @author zk + */ +@Mapper +public interface JobSkillTagRelationMapper extends CommonMapper { +} diff --git a/manager/src/main/java/org/jiayunet/pojo/po/JobSkillTagRelation.java b/manager/src/main/java/org/jiayunet/pojo/po/JobSkillTagRelation.java new file mode 100644 index 0000000..5ef6cdd --- /dev/null +++ 
b/manager/src/main/java/org/jiayunet/pojo/po/JobSkillTagRelation.java @@ -0,0 +1,31 @@ +package org.jiayunet.pojo.po; + +import com.baomidou.mybatisplus.annotation.IdType; +import com.baomidou.mybatisplus.annotation.TableId; +import com.baomidou.mybatisplus.annotation.TableName; +import lombok.Data; + +import java.time.Instant; + +/** + * 岗位-技能标签关联表(bg_job_skill_tag_relation) + *

预定义技能标签与岗位的关联,用于匹配度计算

+ * + * @author zk + */ +@Data +@TableName(value = "bg_job_skill_tag_relation") +public class JobSkillTagRelation { + + @TableId(type = IdType.ASSIGN_ID) + private Long id; + + /** 岗位ID */ + private Long jobId; + + /** 技能标签ID */ + private Long skillTagId; + + /** 创建时间 */ + private Instant createTime; +} diff --git a/manager/src/main/java/org/jiayunet/service/DictCacheService.java b/manager/src/main/java/org/jiayunet/service/DictCacheService.java index e7350b2..ee432ad 100644 --- a/manager/src/main/java/org/jiayunet/service/DictCacheService.java +++ b/manager/src/main/java/org/jiayunet/service/DictCacheService.java @@ -5,22 +5,26 @@ import lombok.extern.slf4j.Slf4j; import org.jiayunet.mapper.ChinaRegionsCodeMapper; import org.jiayunet.mapper.IndustryMapper; import org.jiayunet.mapper.JobCategoryMapper; +import org.jiayunet.mapper.SkillTagMapper; import org.jiayunet.pojo.po.ChinaRegionsCode; import org.jiayunet.pojo.po.Industry; import org.jiayunet.pojo.po.JobCategory; +import org.jiayunet.pojo.po.SkillTag; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import javax.annotation.PostConstruct; +import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.stream.Collectors; /** * 字典数据缓存服务 - *

启动时加载岗位分类、行业、地区数据到内存,供清洗/推荐等业务使用

- *

依赖:JobCategoryMapper、IndustryMapper、ChinaRegionsCodeMapper

- *

使用表:bg_job_category(全量缓存)、bg_industry(全量缓存)、bg_china_regions_code(市级缓存)

+ *

启动时加载岗位分类、行业、地区、技能标签数据到内存,供清洗/推荐等业务使用

+ *

依赖:JobCategoryMapper、IndustryMapper、ChinaRegionsCodeMapper、SkillTagMapper

+ *

使用表:bg_job_category(全量缓存)、bg_industry(全量缓存)、bg_china_regions_code(市级缓存)、bg_skill_tag(按categoryId分组缓存)

* * @author zk */ @@ -37,10 +41,16 @@ public class DictCacheService { @Autowired private ChinaRegionsCodeMapper chinaRegionsCodeMapper; + @Autowired + private SkillTagMapper skillTagMapper; + private List jobCategoryList; private List industryList; private List regionList; + /** 技能标签按 categoryId 分组 */ + private Map> skillTagMap; + /** 岗位分类文本(叶子节点,带父级路径),供 AI prompt 使用 */ private String jobCategoryText; /** 行业文本(叶子节点,带父级路径),供 AI prompt 使用 */ @@ -89,8 +99,15 @@ public class DictCacheService { long categoryLeafCount = jobCategoryList.stream().filter(c -> c.getLevel() == 3).count(); long industryLeafCount = industryList.stream().filter(i -> i.getLevel() == 2).count(); - log.info("字典缓存加载完成: 岗位分类{}条(叶子{}条), 行业{}条(叶子{}条), 地区{}条", - jobCategoryList.size(), categoryLeafCount, industryList.size(), industryLeafCount, regionList.size()); + + // 加载技能标签,按 categoryId 分组 + List skillTagList = skillTagMapper.selectList(null); + skillTagMap = skillTagList.stream() + .collect(Collectors.groupingBy(SkillTag::getCategoryId)); + + log.info("字典缓存加载完成: 岗位分类{}条(叶子{}条), 行业{}条(叶子{}条), 地区{}条, 技能标签{}条(覆盖{}个分类)", + jobCategoryList.size(), categoryLeafCount, industryList.size(), industryLeafCount, + regionList.size(), skillTagList.size(), skillTagMap.size()); } /** 获取岗位分类文本(叶子节点,带父级路径,逗号分隔) */ @@ -103,6 +120,34 @@ public class DictCacheService { return industryText; } + /** + * 获取指定岗位类型下的技能标签文本(id:name 逗号分隔) + * + * @param categoryId 岗位类型ID + * @return 标签文本,无标签返回 null + */ + public String getSkillTagText(Long categoryId) { + List tags = skillTagMap.getOrDefault(categoryId, Collections.emptyList()); + if (tags.isEmpty()) { + return null; + } + return tags.stream() + .map(t -> t.getId() + ":" + t.getName()) + .collect(Collectors.joining(", ")); + } + + /** + * 获取指定岗位类型下的技能标签ID集合(用于校验AI返回) + * + * @param categoryId 岗位类型ID + * @return 标签ID集合 + */ + public Set getSkillTagIds(Long categoryId) { + return skillTagMap.getOrDefault(categoryId, Collections.emptyList()).stream() + .map(SkillTag::getId) + 
.collect(Collectors.toSet()); + } + /** * 根据城市名匹配地区编码 *

模糊匹配,如"北京"匹配"北京市"

diff --git a/manager/src/main/java/org/jiayunet/service/JobCleanService.java b/manager/src/main/java/org/jiayunet/service/JobCleanService.java index 6e099ec..f6cdde9 100644 --- a/manager/src/main/java/org/jiayunet/service/JobCleanService.java +++ b/manager/src/main/java/org/jiayunet/service/JobCleanService.java @@ -18,6 +18,7 @@ import org.springframework.stereotype.Service; import java.time.Instant; import java.util.ArrayList; import java.util.List; +import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -66,10 +67,15 @@ public class JobCleanService { * 定时任务A:岗位清洗(每5分钟) *

1. 批量锁定待清洗数据 2. 多线程并发调用AI清洗 3. 写入业务表

*/ - @Scheduled(cron = "0 */1 * * * ?") + @Scheduled(cron = "0 */5 * * * ?") public void cleanJob() { // 批量锁定:原子操作,clean_status 0→1 - int locked = appJobDataMapper.update(null, new LambdaUpdateWrapper().set(AppJobData::getCleanStatus, 1).eq(AppJobData::getCleanStatus, 0).eq(AppJobData::getIsValid, 1).last("LIMIT " + batchSize)); + int locked = appJobDataMapper.update(null, + new LambdaUpdateWrapper() + .set(AppJobData::getCleanStatus, 1) + .eq(AppJobData::getCleanStatus, 0) + .eq(AppJobData::getIsValid, 1) + .last("LIMIT " + batchSize)); if (locked == 0) { return; @@ -77,7 +83,11 @@ public class JobCleanService { log.info("岗位清洗:锁定{}条数据", locked); // 查出刚锁定的数据 - List dataList = appJobDataMapper.selectList(new LambdaQueryWrapper().eq(AppJobData::getCleanStatus, 1).eq(AppJobData::getIsValid, 1).last("LIMIT " + batchSize)); + List dataList = appJobDataMapper.selectList( + new LambdaQueryWrapper() + .eq(AppJobData::getCleanStatus, 1) + .eq(AppJobData::getIsValid, 1) + .last("LIMIT " + batchSize)); // 多线程并发处理 for (AppJobData data : dataList) { @@ -176,12 +186,80 @@ public class JobCleanService { // 8. 写入业务表(短事务,通过独立Service保证@Transactional生效) jobCleanTransactionService.saveJobData(root, data, companyId, sourceId, regionCodes); + // 9. 
技能标签匹配(第二次AI调用,失败不影响岗位入库) + try { + Long categoryId = root.path("categoryId").asLong(0); + String skillTagText = dictCacheService.getSkillTagText(categoryId); + if (skillTagText != null) { + String title = root.path("title").asText(""); + String desc = root.path("description").asText(""); + String req = root.path("requirement").asText(""); + List skillTagIds = matchSkillTags(title, desc, req, skillTagText, categoryId); + if (!skillTagIds.isEmpty()) { + // 查出刚插入的 job,拿 jobId + Job insertedJob = jobMapper.selectOne( + new LambdaQueryWrapper().eq(Job::getSourceId, sourceId).last("LIMIT 1")); + if (insertedJob != null) { + jobCleanTransactionService.saveSkillTagRelations(insertedJob.getId(), skillTagIds); + } + } + } + } catch (Exception ex) { + log.warn("技能标签匹配失败, id={}", data.getId(), ex); + } + } catch (Exception e) { log.error("AI 返回解析失败, id={}, response={}", data.getId(), aiResponse, e); // 保持 clean_status=1,由僵尸恢复任务重置 } } + /** + * 第二次AI调用:匹配技能标签 + *

传入岗位信息和该分类下的标签列表,AI返回匹配的标签ID数组

+ */ + private List matchSkillTags(String title, String description, String requirement, + String skillTagText, Long categoryId) { + String systemPrompt = """ + 你是一个技能标签匹配助手。根据岗位信息,从给定的技能标签列表中选出匹配的标签ID。 + 规则: + 1. 只能从给定列表中选择,不允许自创标签 + 2. 选择与岗位核心技能要求相关的标签,不重复 + 3. 只返回ID数组,如 [1, 3, 7],不要其他内容 + """; + + String userMessage = "【岗位信息】\n标题: " + title + + "\n职责: " + description + + "\n要求: " + requirement + + "\n\n【可选标签列表】\n" + skillTagText; + + String aiResponse = aiChatAbility.chat(systemPrompt, userMessage); + + // 解析返回的 ID 数组 + String json = aiResponse.trim(); + if (json.startsWith("```")) { + json = json.replaceAll("^```\\w*\\n?", "").replaceAll("\\n?```$", "").trim(); + } + + try { + JsonNode arrayNode = HttpTool.objectMapper.readTree(json); + Set validIds = dictCacheService.getSkillTagIds(categoryId); + List result = new ArrayList<>(); + if (arrayNode.isArray()) { + for (JsonNode node : arrayNode) { + long id = node.asLong(0); + if (id > 0 && validIds.contains(id)) { + result.add(id); + } + } + } + return result; + } catch (Exception e) { + log.warn("技能标签AI返回解析失败: {}", json, e); + return List.of(); + } + } + /** 构建系统提示词 */ private String buildSystemPrompt() { return """ diff --git a/manager/src/main/java/org/jiayunet/service/JobCleanTransactionService.java b/manager/src/main/java/org/jiayunet/service/JobCleanTransactionService.java index 4102c9c..f2ed5c4 100644 --- a/manager/src/main/java/org/jiayunet/service/JobCleanTransactionService.java +++ b/manager/src/main/java/org/jiayunet/service/JobCleanTransactionService.java @@ -8,10 +8,12 @@ import org.jiayunet.mapper.AppJobDataMapper; import org.jiayunet.mapper.CompanyMapper; import org.jiayunet.mapper.JobMapper; import org.jiayunet.mapper.JobRegionRelationMapper; +import org.jiayunet.mapper.JobSkillTagRelationMapper; import org.jiayunet.pojo.po.AppJobData; import org.jiayunet.pojo.po.Company; import org.jiayunet.pojo.po.Job; import org.jiayunet.pojo.po.JobRegionRelation; +import 
org.jiayunet.pojo.po.JobSkillTagRelation; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; @@ -22,8 +24,8 @@ import java.util.List; /** * 岗位清洗事务服务 *

独立出来解决 @Transactional 同类自调用失效问题

- *

依赖:JobMapper、CompanyMapper、JobRegionRelationMapper、AppJobDataMapper

- *

使用表:bg_job(写入)、bg_company(查询/创建)、bg_job_region_relation(写入)、app_job_data(更新状态)

+ *

依赖:JobMapper、CompanyMapper、JobRegionRelationMapper、JobSkillTagRelationMapper、AppJobDataMapper

+ *

使用表:bg_job(写入)、bg_company(查询/创建)、bg_job_region_relation(写入)、bg_job_skill_tag_relation(写入)、app_job_data(更新状态)

* * @author zk */ @@ -43,6 +45,9 @@ public class JobCleanTransactionService { @Autowired private AppJobDataMapper appJobDataMapper; + @Autowired + private JobSkillTagRelationMapper jobSkillTagRelationMapper; + /** * 写入 bg_job + bg_job_region_relation + 更新 clean_status(短事务) */ @@ -76,13 +81,16 @@ public class JobCleanTransactionService { jobMapper.insert(job); - // 写入岗位-地区关联 - for (String regionCode : regionCodes) { - JobRegionRelation relation = new JobRegionRelation(); - relation.setJobId(job.getId()); - relation.setRegionCode(regionCode); - relation.setCreateTime(Instant.now()); - jobRegionRelationMapper.insert(relation); + // 写入岗位-地区关联(批量插入) + if (!regionCodes.isEmpty()) { + List relations = regionCodes.stream().map(regionCode -> { + JobRegionRelation relation = new JobRegionRelation(); + relation.setJobId(job.getId()); + relation.setRegionCode(regionCode); + relation.setCreateTime(Instant.now()); + return relation; + }).toList(); + jobRegionRelationMapper.batchInsert(relations); } // 更新清洗状态 @@ -113,6 +121,21 @@ public class JobCleanTransactionService { return newCompany.getId(); } + /** + * 写入岗位-技能标签关联(批量插入) + */ + @Transactional(rollbackFor = Exception.class) + public void saveSkillTagRelations(Long jobId, List skillTagIds) { + List relations = skillTagIds.stream().map(skillTagId -> { + JobSkillTagRelation relation = new JobSkillTagRelation(); + relation.setJobId(jobId); + relation.setSkillTagId(skillTagId); + relation.setCreateTime(Instant.now()); + return relation; + }).toList(); + jobSkillTagRelationMapper.batchInsert(relations); + } + /** 更新清洗状态 */ public void updateCleanStatus(Long id, int status) { appJobDataMapper.update(null,