Files
post_crawler/crawl/task_fetcher.py
T
kgod c06f595559 feat: add crawl scripts for recruitment websites
- btyy (倍特药业), fullsemi (富芯半导体): 北森平台爬虫
- hotjob (中国五矿): hotjob平台爬虫
- leinao (中科类脑): 静态HTML爬虫
- task_fetcher: 原子锁获取任务
- post.md: 抓取技能文档
- export_har: mitmproxy HAR导出工具
2026-05-27 23:48:30 +08:00

74 lines
1.7 KiB
Python

import os
from datetime import datetime

import pymysql
# Connection settings, passed as keyword arguments to pymysql.connect().
# Every value can be overridden with a CRAWL_DB_* environment variable; the
# defaults are the values that were previously hard-coded here, so behaviour
# is unchanged when no variable is set.
# NOTE(review): the default root password is still committed to source —
# set CRAWL_DB_PASSWORD in deployment and consider dropping the default.
DB_CONFIG = {
    "host": os.environ.get("CRAWL_DB_HOST", "192.168.31.105"),
    # pymysql expects the port as an int, while environment values are strings.
    "port": int(os.environ.get("CRAWL_DB_PORT", "3306")),
    "user": os.environ.get("CRAWL_DB_USER", "root"),
    "password": os.environ.get("CRAWL_DB_PASSWORD", "123456"),
    "database": os.environ.get("CRAWL_DB_NAME", "table_comple"),
    "charset": os.environ.get("CRAWL_DB_CHARSET", "utf8mb4"),
}
def fetch_next_task():
    """Claim the next task from ``app_url_list`` and mark it as processing.

    Runs a single transaction: one row whose ``status`` is not
    ``'processing'`` is selected with ``SELECT ... FOR UPDATE`` (ordered by
    ``finished_at`` ascending, then ``id`` ascending), and that row is then
    updated with ``started_at`` set to the current time and ``status`` set
    to ``'processing'`` before the transaction is committed.

    Returns:
        A dict ``{"id": ..., "url": ..., "company": ...}`` built from the
        row's ``id``, ``input_url`` and ``input_company_name`` columns, or
        ``None`` when no row matches.

    Raises:
        Exception: Any error raised while querying or updating is re-raised
            unchanged after the transaction has been rolled back.
    """
    conn = pymysql.connect(**DB_CONFIG)
    try:
        conn.begin()
        # The context manager closes the cursor on every exit path.
        with conn.cursor(pymysql.cursors.DictCursor) as cursor:
            # FOR UPDATE holds a lock on the selected row until commit or
            # rollback, so the status change below happens in the same
            # transaction as the read.
            # NOTE(review): `status != 'processing'` does not match rows
            # whose status is NULL, and it does match rows in every other
            # state — confirm that is the intended task pool.
            cursor.execute(
                """
                SELECT id, input_url, input_company_name
                FROM app_url_list
                WHERE status != 'processing'
                ORDER BY finished_at ASC, id ASC
                LIMIT 1
                FOR UPDATE
                """
            )
            row = cursor.fetchone()
            if not row:
                # Nothing to claim: end the transaction and release locks.
                conn.rollback()
                return None

            # datetime.now() is the naive local time of this client process,
            # not the database server's clock.
            cursor.execute(
                """
                UPDATE app_url_list
                SET started_at = %s, status = 'processing'
                WHERE id = %s
                """,
                (datetime.now(), row["id"]),
            )
        conn.commit()
        return {
            "id": row["id"],
            "url": row["input_url"],
            "company": row["input_company_name"],
        }
    except Exception:
        conn.rollback()
        # Bare raise re-raises the active exception with its traceback intact.
        raise
    finally:
        conn.close()
if __name__ == "__main__":
    import sys

    # Switch stdout to UTF-8 so the Chinese status messages below can be
    # written whatever the console's default encoding is.
    sys.stdout.reconfigure(encoding="utf-8")

    claimed = fetch_next_task()
    if not claimed:
        print("没有可用任务")
    else:
        print("获取任务成功:")
        # Print each field of the claimed task as " <label>: <value>".
        for label, key in (("ID", "id"), ("URL", "url"), ("公司", "company")):
            print(f" {label}: {claimed[key]}")