修改数据清洗相关代码
This commit is contained in:
@@ -19,6 +19,6 @@ public interface AppJobDataMapper extends CommonMapper<AppJobData> {
|
||||
* 查询待清洗数据并加行锁(SELECT ... FOR UPDATE)
|
||||
* <p>必须在事务内调用,配合状态更新实现原子锁定</p>
|
||||
*/
|
||||
@Select("SELECT * FROM app_job_data WHERE clean_status = 0 AND is_valid = 1 LIMIT #{limit} FOR UPDATE")
|
||||
@Select("SELECT * FROM app_job_data WHERE clean_status = 'pending' LIMIT #{limit} FOR UPDATE")
|
||||
List<AppJobData> selectForUpdate(@Param("limit") int limit);
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package org.jiayunet.pojo.po;
|
||||
|
||||
import com.baomidou.mybatisplus.annotation.IdType;
|
||||
import com.baomidou.mybatisplus.annotation.TableField;
|
||||
import com.baomidou.mybatisplus.annotation.TableId;
|
||||
import com.baomidou.mybatisplus.annotation.TableName;
|
||||
import lombok.Data;
|
||||
@@ -20,8 +21,9 @@ public class AppJobData {
|
||||
@TableId(type = IdType.AUTO)
|
||||
private Long id;
|
||||
|
||||
/** 关联爬取任务ID */
|
||||
private Long taskCrawlId;
|
||||
/** 关联urllistid */
|
||||
@TableField("urllistid")
|
||||
private Long urllistId;
|
||||
|
||||
/** 职位名称 */
|
||||
private String jobTitle;
|
||||
@@ -47,33 +49,30 @@ public class AppJobData {
|
||||
/** 详情页URL */
|
||||
private String detailUrl;
|
||||
|
||||
/** 招聘分类: 0=校招, 1=实习, 2=社招, 3=其他 */
|
||||
private Integer recruitCategory;
|
||||
|
||||
/** 内容哈希值,用于查重 */
|
||||
private String contentHash;
|
||||
|
||||
/** 数据来源 0=官网 1=平台 */
|
||||
private Integer sources;
|
||||
|
||||
/** 是否独立URL 0=页内展示 1=独立页面 */
|
||||
private Integer isIndependentUrl;
|
||||
|
||||
/** 是否有效 0=无效 1=有效 */
|
||||
private Integer isValid;
|
||||
|
||||
/** 有效期 */
|
||||
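/** 发布日期 (NOTE(review): the field is named expireAt and was previously documented as 有效期 — confirm whether it stores the publish date or the expiry time) */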
|
||||
private Instant expireAt;
|
||||
|
||||
/** 验证状态 pending=待验证 checking=验证中 checked=已验证 */
|
||||
private String checkStatus;
|
||||
|
||||
/** 清洗状态 0=待清洗 1=清洗中 2=已入库 3=已丢弃 */
|
||||
private Integer cleanStatus;
|
||||
|
||||
/** 上次验证时间 */
|
||||
private Instant lastCheckAt;
|
||||
|
||||
/** 创建时间 */
|
||||
private Instant createdAt;
|
||||
|
||||
/** 更新时间 */
|
||||
private Instant updatedAt;
|
||||
|
||||
/** 清洗状态: pending=待清洗 cleaning=清洗中 cleaned=已清洗 discarded=已丢弃 */
|
||||
private String cleanStatus;
|
||||
|
||||
/** 清洗开始时间 */
|
||||
private Instant cleanStartedAt;
|
||||
|
||||
/** 清洗完成时间 */
|
||||
private Instant cleanedAt;
|
||||
}
|
||||
|
||||
@@ -77,6 +77,12 @@ public class Job {
|
||||
/** 爬虫原始数据ID,用于去重 */
|
||||
private String sourceId;
|
||||
|
||||
/** 招聘分类 0=校招 1=实习 2=社招 3=其他 */
|
||||
private Integer recruitCategory;
|
||||
|
||||
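/** 发布日期 (NOTE(review): the field is named expireAt — confirm whether it stores the publish date or the expiry time) */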
|
||||
private Instant expireAt;
|
||||
|
||||
/** 状态 0=上架 1=下架 2=已失效 */
|
||||
private Integer status;
|
||||
|
||||
|
||||
@@ -102,9 +102,10 @@ public class JobCleanService {
|
||||
public void recoverZombie() {
|
||||
int recovered = appJobDataMapper.update(null,
|
||||
new LambdaUpdateWrapper<AppJobData>()
|
||||
.set(AppJobData::getCleanStatus, 0)
|
||||
.eq(AppJobData::getCleanStatus, 1)
|
||||
.lt(AppJobData::getUpdatedAt, Instant.now().minusSeconds(600)));
|
||||
.set(AppJobData::getCleanStatus, "pending")
|
||||
.set(AppJobData::getCleanStartedAt, null)
|
||||
.eq(AppJobData::getCleanStatus, "cleaning")
|
||||
.lt(AppJobData::getCleanStartedAt, Instant.now().minusSeconds(600)));
|
||||
|
||||
if (recovered > 0) {
|
||||
log.info("僵尸恢复:重置{}条数据", recovered);
|
||||
@@ -118,7 +119,7 @@ public class JobCleanService {
|
||||
public void cleanOne(AppJobData data) {
|
||||
// 1. 前置校验
|
||||
if (data.getDescription() == null || data.getDescription().length() < 20) {
|
||||
jobCleanTransactionService.updateCleanStatus(data.getId(), 3);
|
||||
jobCleanTransactionService.updateCleanStatus(data.getId(), "discarded");
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -134,7 +135,7 @@ public class JobCleanService {
|
||||
|
||||
// valid 校验
|
||||
if (!root.path("valid").asBoolean(false)) {
|
||||
jobCleanTransactionService.updateCleanStatus(data.getId(), 3);
|
||||
jobCleanTransactionService.updateCleanStatus(data.getId(), "discarded");
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -142,7 +143,7 @@ public class JobCleanService {
|
||||
String sourceId = String.valueOf(data.getId());
|
||||
Long existJob = jobMapper.selectCount(new LambdaQueryWrapper<Job>().eq(Job::getSourceId, sourceId));
|
||||
if (existJob > 0) {
|
||||
jobCleanTransactionService.updateCleanStatus(data.getId(), 2);
|
||||
jobCleanTransactionService.updateCleanStatus(data.getId(), "cleaned");
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -86,6 +86,10 @@ public class JobCleanTransactionService {
|
||||
Long requiredIndustryId = root.path("requiredIndustryId").asLong(0);
|
||||
job.setRequiredIndustryId(requiredIndustryId == 0 ? null : requiredIndustryId);
|
||||
|
||||
// 从原始数据透传 recruit_category 和 expire_at
|
||||
job.setRecruitCategory(data.getRecruitCategory());
|
||||
job.setExpireAt(data.getExpireAt());
|
||||
|
||||
job.setSourceUrl(data.getDetailUrl());
|
||||
job.setSourceId(sourceId);
|
||||
job.setStatus(0);
|
||||
@@ -106,8 +110,8 @@ public class JobCleanTransactionService {
|
||||
jobRegionRelationMapper.batchInsert(relations);
|
||||
}
|
||||
|
||||
// 更新清洗状态
|
||||
updateCleanStatus(data.getId(), 2);
|
||||
// 更新清洗状态为已清洗
|
||||
updateCleanStatus(data.getId(), "cleaned");
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -162,11 +166,16 @@ public class JobCleanTransactionService {
|
||||
}
|
||||
|
||||
/** 更新清洗状态 */
|
||||
public void updateCleanStatus(Long id, int status) {
|
||||
appJobDataMapper.update(null,
|
||||
new LambdaUpdateWrapper<AppJobData>()
|
||||
.set(AppJobData::getCleanStatus, status)
|
||||
.eq(AppJobData::getId, id));
|
||||
public void updateCleanStatus(Long id, String status) {
|
||||
LambdaUpdateWrapper<AppJobData> wrapper = new LambdaUpdateWrapper<AppJobData>()
|
||||
.set(AppJobData::getCleanStatus, status)
|
||||
.eq(AppJobData::getId, id);
|
||||
|
||||
if ("cleaned".equals(status)) {
|
||||
wrapper.set(AppJobData::getCleanedAt, Instant.now());
|
||||
}
|
||||
|
||||
appJobDataMapper.update(null, wrapper);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -185,7 +194,8 @@ public class JobCleanTransactionService {
|
||||
List<Long> ids = dataList.stream().map(AppJobData::getId).toList();
|
||||
appJobDataMapper.update(null,
|
||||
new LambdaUpdateWrapper<AppJobData>()
|
||||
.set(AppJobData::getCleanStatus, 1)
|
||||
.set(AppJobData::getCleanStatus, "cleaning")
|
||||
.set(AppJobData::getCleanStartedAt, Instant.now())
|
||||
.in(AppJobData::getId, ids));
|
||||
|
||||
return dataList;
|
||||
|
||||
Reference in New Issue
Block a user