From b3f35dbdea8b7d9a00031469d78d95244ceb66dd Mon Sep 17 00:00:00 2001 From: kgod Date: Tue, 26 May 2026 21:03:24 +0800 Subject: [PATCH] feat: Implement job scheduling and company recruitment search functionality - Added a new scheduler module to manage periodic jobs for recruitment data processing. - Created a search_company_graph module to handle the logic for searching company recruitment pages. - Implemented nodes for searching, extracting links, verifying recruitment lists, and navigating to recruitment pages. - Developed prompts for LLM to guide the extraction and verification processes. - Added state management for tracking the search process and results. - Created a test script for crawling job listings from various company websites. --- src/scheduler/scheduler.py | 117 +++++ src/search_company_graph/__init__.py | 4 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 301 bytes .../__pycache__/graph.cpython-312.pyc | Bin 0 -> 3041 bytes .../__pycache__/main.cpython-312.pyc | Bin 0 -> 3924 bytes .../__pycache__/nodes.cpython-312.pyc | Bin 0 -> 18448 bytes .../__pycache__/prompts.cpython-312.pyc | Bin 0 -> 3474 bytes .../__pycache__/state.cpython-312.pyc | Bin 0 -> 969 bytes src/search_company_graph/graph.py | 105 ++++ src/search_company_graph/main.py | 88 ++++ src/search_company_graph/nodes.py | 457 ++++++++++++++++++ src/search_company_graph/prompts.py | 119 +++++ src/search_company_graph/state.py | 31 ++ test_crawler.py | 62 +++ 14 files changed, 983 insertions(+) create mode 100644 src/scheduler/scheduler.py create mode 100644 src/search_company_graph/__init__.py create mode 100644 src/search_company_graph/__pycache__/__init__.cpython-312.pyc create mode 100644 src/search_company_graph/__pycache__/graph.cpython-312.pyc create mode 100644 src/search_company_graph/__pycache__/main.cpython-312.pyc create mode 100644 src/search_company_graph/__pycache__/nodes.cpython-312.pyc create mode 100644 src/search_company_graph/__pycache__/prompts.cpython-312.pyc 
create mode 100644 src/search_company_graph/__pycache__/state.cpython-312.pyc create mode 100644 src/search_company_graph/graph.py create mode 100644 src/search_company_graph/main.py create mode 100644 src/search_company_graph/nodes.py create mode 100644 src/search_company_graph/prompts.py create mode 100644 src/search_company_graph/state.py create mode 100644 test_crawler.py diff --git a/src/scheduler/scheduler.py b/src/scheduler/scheduler.py new file mode 100644 index 00000000..6c9cab97 --- /dev/null +++ b/src/scheduler/scheduler.py @@ -0,0 +1,117 @@ +"""定时任务调度器""" +import logging +from apscheduler.schedulers.asyncio import AsyncIOScheduler +from datetime import datetime + +from src.config import settings +from src.scheduler.jobs import ( + job_step1_search, + job_step2_page, + job_step3_next, + job_step4_detail, + job_step5_crawl, + job_periodic_crawl, + job_check_validity, + job_generate_company, +) + +logger = logging.getLogger(__name__) + +# 创建调度器 +scheduler = AsyncIOScheduler( + job_defaults={ + 'coalesce': True, # 合并错过的任务 + 'max_instances': 3, # 同一任务最多3个实例并发 + } +) + + +def start_scheduler(): + """启动调度器并注册所有任务""" + + # Step 1: 搜索招聘页面 + + scheduler.add_job( + job_step1_search, + 'interval', + seconds=settings.job_step1_search_interval, + id='job_step1_search', + max_instances=2, + name='搜索招聘页面', + next_run_time=datetime.now() # 立即执行第一次 + ) + + # Step 2: 岗位列表分析 + scheduler.add_job( + job_step2_page, + 'interval', + seconds=settings.job_step2_page_interval, + id='job_step2_page', + name='岗位列表分析' + ) + + # Step 3: 分页分析 + scheduler.add_job( + job_step3_next, + 'interval', + seconds=settings.job_step3_next_interval, + id='job_step3_next', + name='分页分析' + ) + + #Step 4: 详情页分析 + scheduler.add_job( + job_step4_detail, + 'interval', + seconds=settings.job_step4_detail_interval, + id='job_step4_detail', + name='详情页分析' + ) + + # Step 5: 数据爬取 + scheduler.add_job( + job_step5_crawl, + 'interval', + seconds=settings.job_step5_crawl_interval, + id='job_step5_crawl', + 
next_run_time=datetime.now(), + name='数据爬取' + ) + + # # 周期爬取 + # scheduler.add_job( + # job_periodic_crawl, + # 'interval', + # seconds=settings.job_periodic_crawl_interval, + # id='job_periodic_crawl', + # name='周期爬取' + # ) + # + # # 有效性检查 + # scheduler.add_job( + # job_check_validity, + # 'interval', + # seconds=settings.job_check_validity_interval, + # id='job_check_validity', + # name='有效性检查' + # ) + + # AI生成公司名 + scheduler.add_job( + job_generate_company, + 'interval', + seconds=settings.job_generate_company_interval, + id='job_generate_company', + max_instances=1, # 单实例 + next_run_time=datetime.now(), + name='AI生成公司名' + ) + + scheduler.start() + logger.info("定时任务调度器已启动") + + +def shutdown_scheduler(): + """关闭调度器""" + scheduler.shutdown() + logger.info("定时任务调度器已关闭") diff --git a/src/search_company_graph/__init__.py b/src/search_company_graph/__init__.py new file mode 100644 index 00000000..2e39d57d --- /dev/null +++ b/src/search_company_graph/__init__.py @@ -0,0 +1,4 @@ +"""搜索公司招聘页面模块""" +from .main import search_company_recruitment + +__all__ = ["search_company_recruitment"] diff --git a/src/search_company_graph/__pycache__/__init__.cpython-312.pyc b/src/search_company_graph/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58232c197585fcc963ff3da052ec24d639b4cab3 GIT binary patch literal 301 zcmX@j%ge<81YI4xSq(t?F^B^LOi;#WDIjAyLkdF_LkeRGQx0P;Qxp>;Lke>`V-#~G zizaKS98kft33HxrS@g7Z&C~uJ&$?&7Xq@qK;ntUP7d>0C@af#?FBySKG#PJ66{jW^ zC1=DZ=jRqA=2gZQr6w1ZW|rip=9LsN1C{t`vfN^ik59=@j*q{^lAD;B2Nnj2CFbPB z$FF4g3{=JN%h}l~CbT%Us5mA&KQA@LCAB!aB)=dgKP@e_s30*jrno3M2F@&om}4~II(@{-WlNn zk|R-(Kn1Bl;wvgqAr%m*K)+B6qJKbNLSqSYq5`SdNuFvHp8C|8JzwI4M6Bf9ncdmB z-QWCn#((;JZUXJ(kgspOhme0zv6@g4=7vBxPh=7%GAG;OT*4N%CHOGU5t)~TxR4OT zB9q0qJ>dvD63(zQ;R?GF?ywu??TRQn6i<&JJC6%tZ<+1_-B+f&L9Zy&J)rwxudT-> zdqJ*D`$83|S~$+WNV4}68W}j)Qut+X`ueH-50{?*Hav?b4%rO%em$ZqY~A4ZcI}zv zaGoLkf-cq>Z{{ zlICDgV`Y{>;^xySJv#-TC+arNZcN{_c;{SH|<1i?f;G*|QG{r^oX{7iaGN 
zH1o%a!pP|K&)*ppQ8;i+#zUrPTr(U8BN|GEC#fj1WM23NXYjp>euJYBa-!k4TmJiK z_lMn|X$sZ4`_-h0U*X&L~h4)l&fhCFk4PoxLalT~Z8j@F#NKIf^Qg9Wk(EF>HQs~v=u!RjRwHrz|d zG43d*S&%itMB!?taAmx3;Z}a+d&H{@DOv&RhBK*_;_E_WSg2_gtb}X~3`0QJH1yxn zLRaAyPxPP??Jt?)N6?Yh1RY$zvMV2nQ-8>cc z!OF_FE2qK=bm7ZvgtQp4r>eoFdH30yU(ehcoW6Iq@a?GKrizw|>x=+mkD5x#hNw`g zQp4U8iNzJ!@G)9g#AZm)8gOQk!UUkk1i*X+KzwvbTfqpgsD+@*kfNP9eHh`9rmSO4 z&R@GQQI~ZDbN)J*@HW1R%6!cGz|YF4$hNYWFaL=6GG5q@aysJ8+EJT@SARDl@Wf2a z>S58uEPw0%%+SsJ$m!`z$EQ#JTF49?S-wBaDT?$CYG=XOyDrt&JKUsm0<|z=YMp-U1dA1@6!~paX{0ufF2C=$dTZa{G&^ z`W=%(AQxyjf8d(~lg*oMS5E~#$aEpp!O`7g6;n-{{&H-}`PU)Vt4yrH$$WEy#NJmI z+~qvc=Tni6cVo>$>KLXf(VATtu>c&Oe}CLcR=jSv_E>RS(N@s$T&%s;R@7O!LgUxk z+I6q#b&0Ltnq3$d5->X@=7S|o=5&lJYqAJs&oOG7C^3xf!8Es|VR2k^%H{kDI=&*- zbYvuJ@CD}*|I6(a^#*Il!rlc*w#%+je$gqH^J3RAm$6p1SCrjLPS9*wEsPT-aX$%p zQiyAQ=yLwv@1>G_`2ni}i;^Hx4)e<#@&lhcY~{LckXMq9vm+i!d@e^izg!cg3f42T z&UD1pXe6$6wp;7lz>$VoBt!0qjPn%h4na>m?2! zVUZJW)tQ~cJ5TMJ6dD*4xW~Viq)tl;Q(Q9|p*w5|S<>7| z7*l!1gb+kOHrg9(sAaRLRG66hpM6OtM}^oxzn$Q&@kq)0ZODoSPwOh5GPolg?b z%+L>A&F=no-`lru-<^JM?{6lP5kYxBX7a5C^mj5)i!6b`{U(gi0OC+9;xMQ2<5n#9 znpO?;l%MLPTWO4VEk|#92_eDBYkaI~B-{HeS*#)sYY?x8O7n)3unQ@^Vx`Y*pLI^P zaZ;nK#>8W-Kd+N*Wo69xO`Yaevy63qOJ{y-K43Ga=k=Tcs*x|?^7uktUkU~`>N)mZ zZEF!{dKa}8XU=B@f3a^~u73)K`Qno^)2b&Jb!)mc4oidqL2}3OcXxLQ-u8|#%<0|^^KO5*10d@Wcz2k0wF~Z02aJN9 zA$OqL6>xX*lEE7YMZ&I#;Ft6ryqoj$q9`!}AC3qC4>jWz>2y%HL%n|*W>+B$k2X9L zdW2N>7oqL&kbempzts#-?`gHDhl*0ZnXnQwYVm1RlEy_fuL}Kx+7=g@(!Xy8r4&rD`TVR zJ7utnGv=;~>U~A3)odTl{FO3iqGuv+d`2%!|JApr_c!`VviqutGpnn6@}had_oI0b zBic7hm1lc3su@3L-VlsDgj(R2d52(}`!-_qHi`28!D>B<5@Y(45b z26Z*m8(y!bE%=D$%}h3 zN-|h2(^ZbYeyBHkn#e27q}7#^IO6K zNj_4MI~4MFyF?|Oxa3qL>4_SNfIt{o(uM>tIIn|*v^ltmm#B6=ENMOdpvc2kBwhmH zVkMF|BtJR+Mi`SnLxP`;nhcT1yWb4;T|x$C%YI z{K}U0vC+s*G(?2&%jq+C2-ML{^C1qLHzaj=6FZEW`ecdgi*_p z#v`?(g$pLwIx<`vU%CHvS$Nwv=Y;8~X{dAD)|e=3jIDZlZS4eWe|!;mST^VI>;toh zD#lBf#C~+$QaWj^ibvCWIP*Zw-kSdT3D!Q!+W)l%qx>p}-MVmFDY(*lA_HUet_W@Yh8}AQ{nNXPJIPPqao@7R--Wv9p31Db*5Oi9vH@wyar) 
zot65Lg|cQn(Y#Dyr;WKxGX%33EL^t8ZK2$j$Sqi_(p)Z+aRnxtl`^);xT@H>RC{>| zcD`!7T+a}^lma_f5VN9CcZI<#4D1!1j$pG6)_UOe&J;os@|Eut0OiWZ;hvmpUlg63 zxdK8e*5oGI9vp^Cpi($W)q=|u7VJ3^I#==rTAzN>hu28hl34~=RxC} z!5ubW$>jToQ@j3>j14P!LrIGbTPa|e2)W#UbEwaj8vNvG9morWAB1vf9F&qH;2upF zSE5L6An^bz!i&&LG~pYe8sJK(;u>gVLb+X3+56_5qM9N9c+qPK(`&uXJG5ccSUy2l zT-TXy>P@5hH52;zH;PJQ%gLW^`F<)MiC14^EALULxC~guU>-F)uIcBc3y`_-kx2OJ zor30K+>e_J>3z!pIW#R!$8(a-<>G=KmkU^%d_8!B5+!zvk}d>a7k=*sf%r?lGRS8F zXhFV%uMWl=0Ct5FL2@O+jb1+yG9fofcv5lp6Qox8CUJ+nN~$T$iqu@9o!Q#=e!n!*@g zN9+w`zJZLlkQHc#W@B5gQl*o$Zs*EvD|f!N?X7t8Uh%-9Bl?3)iSn1n%3n^D*N>Ig zC(4_~%A3Y#Et{Z!kVFe^p%-tV`g_b0Ts6EfjiCEd=|_@$g9a6_L_B z8c0cuH`{<4ryDt++P~Ap?K%6q_Zz(t zX?D+k2Yqwrd*1JLU*8=*G#GRggl7f~ogd#%QU8T6p`Z!^cUYRDMk$`!Pw_ObaMJr} zcq*NWF6Dk@7qg$CF`aR$y43sC1XejUUF?3S#;bXaQ`@E6uOl$)OzP6_*LNBA8@i19 zjWng8wo<(INs8AA%AY9Z{`M!6)FepNN2QubssU1sQK>0>HlHk*d6STOh~ZP7RP9eA zu!1)~$?UiAsX}TnFAd()d5fTf_Y5eT4)2*!?-}r(74@D8@7YoBS@527LSfD6(*vHj zo_YM{<8RNs@x7bRP2NVlo2?4Jc9Y#D?CRkKr(d#}$65?n5Oy8!RH ztwOgT+MP|#uG=uJ6eK0J2zIfpqs47^3s$8+ZS&Xm?x}Cs+uF3fd1p)O1ABJwdSI_V z|0|pKY~Ql4b z`M6uOx4D}f-G^O!1XrKa-Dif(`BNivr(U@I{Co4y{ov+1&(DAFY+o)U&kdZpJuvjp zo~F6M(Oa)f{OR4{`LhFauYR}s#?+5rgW2H)SDWbQbvt^x{Y;FTw2pJ;6t?Y$ibx?x+mY`rH$12jXTBmi7x z5Isd*Wpb~k=SXE+X41C~G!7OIx`wP?X6sdR#(49Lxq4vBV8aYkEoO(RCSJ}Rkp1PB zbCHE)1z5_ch17pV5FLk31eXxE{p}wQ-}>n{gNv653G-t=og4Y#jj3PGfBVH--yD;d z@b)YJA{rsnpXzY6ib9*%=Wurk-R@SLWU_j5gRjoN@cNDSUYdXQ4Kly5bRzB%F%_dU zj4T+D70L(5&pgu8;}o$^QbEi>2=}EtEjb9~g(6%sR|!Nox9L~85~-?bhTA1E8DnW) zW>>2CJ3F2O0ZxqHMu zoDR9p{NM}Frd;9Xcb}Yl?&Y}`$H^30wTn+Vb}LVDWX9tryAX?Dge&2)LWGl@bv1{R zDjuB4*)yu1cH0g{7<4$-Mb>fqNXs$Zd{w3LVH4AHnyL3kqXg4GDN{RG7` zk3y*zm{H}oR4QoK4QnTdPBsKZp1dm5Q@ExwR1^Bc!yFfQ^1%GD@ zni!0rM|t#XC>(!xnj2{o2|VjzUZh@9gvuj6Jk!rS2E6`+HUtmIDWK7FBhSIs1F~@YrSEgw_U>x>)4PK=fAr4nGp}+k_X(#^DN}gv z=IcX$dKdnN=caym`{`$I{q#Gx2j02$-f6#y@9Apm>2?DVarW4GfxkTngaWvO#_i}5 zdivbWBCei21$Ksxiwk8|u5JVO=z?@C293JhT$>Zl5Ld?mSLj1RWqYKDKVc)O%|N%y 
z++v?#U7xrCtZF;bCx|Cnf`_l(=`1TH<0|Lmft7RJJt4?-w7r~j0&6U_+79)Io9%5K zWr7ni6r46u=;}Etlvx+)Og0&AsvV+^cWi`uPuPIX`99_*J64yIt_79qK{(qsOmuIR&BD&k9HF&OR)p>J=T`ZmY`A^n&;!E&&|;8oS#qboILZ z4C3IY1s4jMWX(h*TYh6Kmf{*nMB?o7L3E4yY=F9vnmLy9ht!fmO+b;Q%?>b1gT&q3d(e=yJKV|@WBeQ;^ z;=G!!U#~u|A#ed*U#C9LsW81n1vSnuqoLsWa&7%u)%jHlOkYDIyp}<@j>dSSw!xsf zz;X?&>OvX4X*qkrnuc&)60~rURYC4WgA(AjFm(ltkggHxYlclEQ(vHV(Iub&20l!Q zfj%k-WmU-Zv7h$PuB=6Bh2_lnS}jyNP;(Crbo~&}b)c9DX*vURIY`m_8D8ySj)G92 zrQ9Li1$cpn36%)<5v1JxDvwH3cvMJ7JGJsSqT@yS1pXds(ZeiKhK-crwP(Ylq^aQ! z21llLhdG8POl4uL6%=&<#?!BX5vvp8q~UcZ0m&CBKFOoxiDVA29$*F;G{6RcSs8vQ zO!1-TN(OYN&M=LSjMNk6R4o+n89)5a!|yck8|1zugt^iSF|7OYE%fIPfzKy@<~f$_<(fE>KV@d)tP_JwHUN(AdHUCPN2aC*DT z)db3k4TTK5quW(hD&t*hwJst#$cP0S0FG7fhWUD=&n=XdhDKPr=;XyR=*6aU?$xJ3 zhX2#M--Q{KPY2f_3WtIdw_v)+CIhq8(F@aQz^UIh=5c z%N!t|pg@LC8;%ur3Q5Elf`;4mRfoHythB@J?yaq?lovWwd_f5pDH1Kz$Q_SlA_{URjjo>do<7*FU6r^Deia#=pTU7ym0~TF z5!Yd~9wU^E?y3(A4Adn}RLz;S>g^hT!98!BpdH zg>m@xAh!4Rp;4e4R6p9`LahsT7Y11pG@~RKG^Yex7q>$xRH0pUAcYWVFrcPDnlqd+ z1gPv%_70VurTq()ovIz!`RQ6ppK;xiHG1&$L21bzucc)+xkX}Iphjl4#H9VZDQA%J z8O^6UhB`+2#vbt+3kQ{usm~pAO(zuv6e@$|TH%^0W~$9wxM?8m3V9|0qVA6v(1d zasx}LwDf@{kSe9Tia(^<0)rcW5RgVI6txGm9FvD$*oh3 zzqY)+bE@vL0tDL!zVg9+mu*t+zTdO^14=5b67e;kT0gXY#N$mWmYCvzi-ysE7PyZx z?WI2pq*1vI^yeUersc)Rv43-Y1*Y+FOQVJ!r5mc4v21{TMmN+j|6B;rc}sG`TE+Qd zx?#2Ydz5htr) z@nmt5sOt#@N>J{DqTK?)P2o|v=o29FpgEcdhhc#adI$zy1?sV~Gc3Y{(kUurY6iiN z5mP)&B0(#nzPC`&N}~2P|5xpwP=dJS1aXTP;}Or0xOGAqBW|&McwiTkoj`Gp7N2gu z_{RMB)5uxv>tKRo_yy6}xH4FuMT7^iN-e_pWO@z9u@~sI9aTf_kl+R_78)z(9tE*R zrX00gDOkS2>RQV6$B9Go84qdSZXk8xrm$Tbrj#vGCrHn+NG$!vM*F!oJD|dq35zko zVD&^2O)Rg9yfqV&E8<=}X@J?RD=mBY>!k;mTT2OHJ!yo2*FjRz0oMjdwW?*Rst^Ug zS|(n8-S(}`yTNo{-?G`SY8QL@dI?>%stEZM(Q@lI;LyPwA++MyM6{v!Rb4ooI0aeU z2w5b06ad%vA(H7JQ=l?m&o6qp>r7XG(pBi;=o>TcSab!Mf1z|)nS+K;GbvM^FV%7f zh?m)CGLLFcYo+XE6T7CWepUB&o!7K|P<1uQV$>kFV zr#Ag+*W0_irX2}NUQbR5tbvX`zYBEN0>nmTz6wc7c>c}R3}`-1Z_1#bv}|IjGxR1k z^IbJSg4)At|Xe!f8s#1#<&hB4YH_u(fa# zELTWhJ6reLPFGcBS9y+0}8EpYnM{BrXI}DSDMC67KYhggh 
z;%Y))2{cczzadsyu)sw~;KG_=9TJ8Y3_Q#07Ow-umQJJr@=3h@oMAz|u!ZyQjL-O! zAOT-EJ~x`;Nr>2L<4g4`qt+wYjaLA2jqsYp+9H7*5~=|B2h#q(}O^r5fzy7F13Tw= zbK>jL`Uj-t4|;WbW|=(_v&W}1o?14vOtLJS*y`2Q%rZ3+Qxi}qRR$0;XN^mzjZ4S- zrv%Bk)N5>ulra z*OJnZJR7dEdL++=d)MVM*=z5)&cO6NZfh){C+S87^JXqU=M5m*G3Sj0UPd>nne)rD zApHVEH?qtH6@#!Yy-`nH;OIsjbD@wyxFi$P>*&TD=E4Ruz!yz)V;*xcr3&CnP#0n~ ziCGfsXsC5buWig!Ua}}KH-knv6WhO(hh<`jJ*qWCn*8#_K18U1WGkUI37z4qmrx2t z;l;|iu;CC06!t|zqRXL(+!kuTp8-0*jg(+6)Ob{# zcs&{yz+edKSwHJx#dHswkSOXx7`aqnFl5nmh~-`MaK1;=i3BSyc-T-4Fd433qAuHb zy&JhF37#-ZLxW>9^K|DdECYB#O=8&8_OnsM-ssWt$w70b$-@GfXQFz~VJMG=XWJ=Y zYTC05A?8Csyz$1K1R&l8U`h>Z54^^sj%5{CH&nk(0T);5e)VJOmf$n);wq&t8=58b zy9$UsaaWMN6J+a)2sex}UFt&$B~$vUgWTNUw}Q6Jh~aa1VRf6$W>pZfCa!|&czcEB zCa_jDll$6mdreKdxLcD?-2BIh&}|G6KCO)3*kBqog0(VDZUmM)< z0)DdkPh273PRP-HwCAuOq8uf22jZ5YLz|D=g;mA-G3x+E^$>x-0I~{yQjq);nVGN* zbuXAn;d)8wbavs5G8!es)mS#U8}V^EPD1ZEeX@mQ8)Ugq@j{*=2br@NPz(n@7PL?t zc8N!z$a`4!{xEBSYwgVPX>-NE&RFK6+c?AA4^-S(G;1iEHk3^|FXzk{9vawkO_w&S zTRyE@K2yHItJ^rx5MZ<_oi8JM^wHCgzS=aEK2vJ)~eZi~B@!ejoMk-h(t=b~#w$3oyu9XDKW zacs$$MKTurp5?&6kXiwzj+|w_ynJ6?fsZTvRIAdf1C&ao4pdNPuq+T>qEYFf5y`ZC zqJE-UGFHy8E3W5rzQQ73;SyhQ2~;wH4Me5Y{WYM1vVU{!hM9c4UJWGuytbgBhB{w{ zHi+|92H}e3kbWVn9(l~gDlkb<7i+YQxyp-mX^k1mODQzKm&_{2y_BIsI9Ca93~#^% zCA{G=*nP2zKqq9y2UHFc3jK<9DlAkg7AXUEh;SJt2-peu30#2Donaml>HQ}`5JX~C zv`W#>bl;b_C0lCXn|?OEBVVG8B>c$f!}#|8v}!ieko;Peu^F zqEED&BN&5RN&1WQLW~GY`cM}pa%FWbck7K`-5z~o?(JX5&XkLKP!c7$4JwK}Mra3W zzWv`2`p_Rzmd~m|_$2PLvVc1G*3VFAd}G+J5=0T)08!bV9~ziHI}}swCAfN zjATa^m(eC8i@NAZB_jP2_hE#)l2~YDqYR2nVh2VbGEt%fBjQnu0+TG6D?IhK;%LZAS8!}p0Ye{ zI_hwN9~x*ttw)JtviMCXdkZUxu(@Pwk1=&*%TU*>amloC$-q{hC3|$=>3vc`<)qnb zsTpYYW#+xCJ)@P1R!tu9X0DSM3s^?zWxR9|~XB)t;?WteOVb?wwH>iQddw!01yoG;jl@>FUYG zx3;~xZL(G}ZkS;=Udt|vM#)qzRc!cm%5=phsl4IWD=)QN&i8Kq zn$);Y+V`+@pj~>nLozyM*iINrM&@ABwT#@chyRda9o!mFq#8=WXKvJS+A*Fat=%E* z-8bU!rthCM?UzjZ<2=|T(^431N|rA>e>S^nI=gDJ?sBn|UFFT*J)5>$O4|+CWaLNt zyg`ktCQHCVWiGpGqjIbv#0Rx7JX6*+eTrn>HKT97Ua-tpvU)0Ix}?@uQtB(Sf~t^a 
z1pUHb44B}`a-a#^_l)JUhN@{p)s%k5usKSLI@jdYtwSYB_XSGS-E^V_RZgmBSL~Qx zu|uldDe0PKm|fS<445}|2=7bAS|we@3}d^lG0bWTKhYFk&splrDuVI8Ty_Spa80@Z zT;b|qaWj|rEcu_Rl&A$MK@0kZf-;qT9?(-M+`W*mpg-&@m`_ zW&7n8DgD72(;i=H_SoTx`ZGPBq*h!nt?~tz-&axTTd@)*FE{-!l**D0x?k*qfFI&t z1F6u#y+9_xsXU^+u?w{~*f6=JzQU@>2d4{HFI3_*P~~r~7SMzruTO2NgIgPV`wHfK z`cj1JQn#;GTrknwtJN1$7=$hL`H*q3n%<#iF4n99_|meB9VLqQX?llN{k|d>;e2|B zUj2RngK%;B4zuF@T6#ya`u+6`!W%Oo{Wlr(jw1DMGO^5Wa#Nt4-&D{|I_5XFdOf`S z+xmj0Wa@G@-DF@c=P(Et)SDsWgK8Sk_+Yhmhf?*yMkSil@-fPnJU=1jE6J18i9$zQa1}<)IVc&46v#Lwi}9#JJA%)|ogLLe zcH&-QTZk&3^JRLB+A#^wP-yi122fR%olD||2(O8;3e`)b2=^O7=|$l=F8n0;9^QWu zb3|B0?pzjE310cI3cj!=^n~g~T=-x26jsF(_JpEpT=-x2lpmVGg*|=gHB0s+dklvg z;q;e+Aua*E4N=IbUBkgDuZSB2yvFV_vKLT{jthJtUvzFs=u9TSJSkM+3#g4L3Y+3D zgN?@orzYVEY^RY=#GWWnsTISAiDLa9pj4xFNCI5b6oR=YZ2sgkJo?yCgCBj&=_D$3 zO27UwJ*d>=8BwW8`(GzD#aInritAZPzT*0A z#DFq7Gk3nQXkEk`Jv(N_$w zvVvIy)&Zi|LLWi>oqSrzmEAmdj_YHgVCBjc-2Ay$thbT1M%`**)wg~o*XCAX5m1un zpZ$KEhAf*MZX;@asIG_m4ePPo{OI`ncLvd+;OS@QUzn72@aEXAqCY^o@y@g1DhoRl z>#!@(_9!R>dNE$4$zzXG(qRA#m2{XzGGmd+LKAEl>9A2m{sI$}^@u|Uv?m)(!g&!z z9w&=@R$0rnfAq<}li=Mbg1tD}i~)u5%kih*3El1Wp^hEFVd(j}cL!jiz;*BD z>+iwL)N-;-G9bkTN9BSWqNw$c4@_q%Vm#?f5hCx-H0aYG8s{>X3Qyo>^NRJecT2EW0+=>~q?XqQH=QYE! 
zS;NX{!%Ck43=AcARZ63JV9Q;WO0vk8rRR2gb@!o~4@&3acco@@gWmYeMsg3iY-06K z>t3sysFaMgGwga8fA$j4W_`tLr&dlE-xpEngLkA$#<%=v_ba=lB{h<6)eN&bk!=8M z9I5Fe?P%rjnu`Z^1{fpYV@@9}KV3d{^yP1y`G(hAIcr`uZC*8~@fj_%#ud}X6<%Z2 zEL$b9Ro9G`Q(Z${V+~#-H_LJo%Rw1T`F77Nn}#nJzcmZX60iV=P2#w;4OBvM-+_DwCo=7BYV7|Xg5b+ zET(rQGZ#zhE8*pROTjJ+^?nV#%gnsLia~f?eHCQomiZu?q_3u%lb8?IVEPp!-E3s8BxCxO<#e-&xl+y`yrQ0lMn9}d-wmb% zmfme(KGHG>8;p?tQ4YO3pZO@a2H@YCX~5#Q7VWN$%HQUv@2XY)7O(?&l?rofRS0iX z0{lA#4Nd<}MPio3NhAg&Al*b_vv#*m`MaFr-74kpHqropOkq18t5gW^ zvUYcl^5aYerexCy=P&?+@fq3PSo(di~~M#L8n3D-a!n?gx<&fX9+)hpZvc>Hi$1kVPFQ7 z>qCgZd`Ht)Df7Qm#{Z;B{z$F(BgXmvNp(z99d}tJT{D(-mx9M#Jwzkx!f*pU;=q>% z`cr)_ojuVIpx`m3yn~OxhWqG}@wPh@JOYo=g|vBcNq~aK)RH^+2yCNs>D6P0vCirV z9_y?QY$&DGV~++XcuctO;3Ke??xbn;`0@aJu}iBbi|>$UU_ZFqLZO@h_;%6iskFP~ JN&Y9p{|B_~Fya6J literal 0 HcmV?d00001 diff --git a/src/search_company_graph/__pycache__/prompts.cpython-312.pyc b/src/search_company_graph/__pycache__/prompts.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f415645284ac6de1e6c9c1a96b18d2575ef9052 GIT binary patch literal 3474 zcmai1OHUhD6gDAkT1#1#MK?*O&MInx^H4XajnuqUT0$ZqRYR#P*Gw7`gOSIU5}`;o z#Ms79LShpz4-;OY7(*b$`1u36Y1LI(hPiiUR*95dS3URMu`_m(mW7!+_uTXN&Uel| z^ZUVrdL8_Y_zvD&IjGa^kwy8hPJMXY01t6px2{vyUDsLHy{~g$cYSC5o&85YaKaf~ zOUoq=N21bvR!)bUf9=!hkZ%}IN5$NNmb`p5_KDw%&eo-yKt*xtG7o;n^gbOyR(!>>lUP+BlBwD9LBa)SNVNPZ=xX@WP&J zP_I97gwv|l58QC{TJO^LVlIb9l6o;07BeI4pjK4Oq`+wuiB-G+&0t6>j50&q)wY(G zc^}!HpExaMR$k`)#Py(Gb=tLr!}n1(0x!|dHVWqyZ>a2)m>)+TpOjtFpKRoiXB9;< zOg9)|K`C>iihB$Upv6JXVz%?VU1+?0`*tJXaE!ZY`$e--UmI1iW1Pcm<@+3Ead1q3 zs@jaQ7=tJnKH>Yyqj~*t4#(D&cy7<-mY>~|*U~IX@}7b$O=gg1Q!ETCQ`2&Kf%#0p zrl^CMw7M#0{E!DGj}q%>$_4My{a|(O*ijsx1%0X%$&*|hgPcP?JnV`4uKjXrlYU@8 z&v6E)-6|NGxStF+9FBgW$!M$z+3@o*B4+2!f~^k(4T{T;15c#ru=HTVz;Q=VDvlqn zE03bcw|(+USn2Nq4S;82CWL(J$oHt$RINr7ojKlRk-+F;4`M^5L`&>NR6Y!G9C|0 z$u*V%X=EF_+^pY{&a*g3L^+N-&Ka~RGeEEgNT9U3h@w8G6f`i_(9KpGXXgc{)gdrh zns0`=fhnwnf+mZQW$p{L&*PYvR2b-{8U5)-l?05Z(wYlzxk0;_$%%zA6wIQT89)ih3mlXl 
zEQ{H&S_iZuS&pC;yvHzDdo1Q><#dX*XagF7ezt&pqhfBA;w9$Y zVrGIs5sVpVp+Y{@dQojtw`%m_XKraZAm##6E(8IPBcA6Z21=5OFl(Yr%!nrEl<^t1 zcq0*dSRp(~Y#H$eY9d5M)S71W0?N z#tAp|?*!=`4M}5NYg+3!b4Cj%^WqImG&wAL)Ht5qfr_VHVg$Mn?SMhnED=ED4}k)> z%JJkD3;=Rz6pgJw50N(DR%64!e_k>@g2~QX?9LuXFWjCcYmeZ7tHfb<^4J%bWw&!3|8DLMU6E_osG>?TBymLW-syM#O;u~32%Ab<{7tUZ=ne78y9t-QryvkO(T z5%JKs4hRB~Dr1o1nn2M}HEWa?Z`_Nd8F#AvWb+WXB4tZ3jg7o2Mg~Z!aYVz+d9`3Qh6sGn5oj67kpv=qPmvhuoTMaoV3#FG+k-%WSp-J0F_kC@ z!CY0sPm`p%&{oc1DWen=cV;SsuoiyM#;TWuLrVh>E)yT3!}8K&HSu(UmP^x65tT}= zWK7M-|IS}!evr}XTzh+kQ-+#~P&eK+DxEG@#5<=hje$O=)mo#rJbS%o_-Z}6fA@gN z)NQkvOuG#ie(Y%b=3IxVrMdO{cGH!%%a^Wn>>mE%LR<62Yo@jf=i06|cU-#AN(9hA(Ck*g#_|^5g&pzM(r|yvb0Eqnyd?V66 literal 0 HcmV?d00001 diff --git a/src/search_company_graph/__pycache__/state.cpython-312.pyc b/src/search_company_graph/__pycache__/state.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3026151e53c69e6a414ad3468bb9c246d7125d0d GIT binary patch literal 969 zcmZ9K&ubGw6vt4`yN&OkAMeZ4sv=T1Lywvhmf?fIH}nQIx^dkdE)=c9XYgA?6x9^C zL~I61*HnsW+>DLE3Dtnqh@=fil%TjQ&*El0(n%n#=acAB@;~oX&pRDa3gR+7?`)59 zlEQqoATrZbXGU|{*L@nC1?%q0i@o=oJ4;(TPe1KGdH!R5egE}`{g<128;jj0h^*nZ zJlzS2qqnHQ9z0wWd6b4Q$^+XcO}}M}yrDa$WkO;SFq?}$BVdd=zEs3=O!`1%{xOM^ z%+w9-Gi=M4qbA|hriSk_E=rDm&uYkIjQT7jh8sA($VM?nQ+FEF6zQY#P(Z{fV=kMj zrbUuOHzh=*32C`zU{l~MA$J4amTv%+AjEVH0G0bp6bR8B$Az9)uEPmoav%h@E$$1| zg0lFw>)LP7A(+wc%j2W<8BQ6m&$$k*Pf$MRyI$R$ou$mvtvY8${ir?1oog`NYu34J z`iK`YIjhkof!~5{32oyG2O8=hY~z7WI=?UpqFia?Q}7=sulKKBYvbY0iJ>+g0!wkA zjZ2+EabXI?$;$fpYS_kSIzz+KsR%g+!QNL|xZcU+7bn^9alI=rxp5yJYEM-)HV9^+ z`Jrby4JOA|lssDx?=!2>^hb0aItI}_E9yq~&J~#`%63}1C&#+&y~NIeRN&*%<3Lpu aWg8X0qRLlv{-;(}3M str: + """入口路由:有已知候选链接则跳过搜索""" + if state.get("candidate_urls"): + return "has_url" + return "need_search" + + +def check_links(state: SearchState) -> str: + """检查是否有候选链接""" + urls = state.get("candidate_urls", []) + if not urls: + return "no_links" + return "has_links" + + +def check_verify_result(state: SearchState) -> str: + 
"""检查验证结果""" + # 已找到招聘列表页 + if state.get("result_url"): + return "found" + + # 有错误(候选用完) + if state.get("error"): + return "failed" + + # 未找到,需要跳转 + return "need_navigate" + + +def check_navigate_result(state: SearchState) -> str: + """检查导航后的状态""" + current_url_index = state.get("current_url_index", 0) + candidate_urls = state.get("candidate_urls", []) + + # 候选用完 + if current_url_index >= len(candidate_urls): + return "no_more_candidates" + + # 回 Node 3 验证 + return "verify" + + +def create_graph() -> StateGraph: + """创建流程图""" + + graph = StateGraph(SearchState) + + # 添加节点 + graph.add_node("route_entry", lambda state: state) # 路由节点,不修改状态 + graph.add_node("search_sogou", search_sogou) + graph.add_node("extract_links", extract_links) + graph.add_node("visit_and_verify", visit_and_verify) + graph.add_node("navigate_to_recruitment", navigate_to_recruitment) + + # 入口路由 + graph.set_entry_point("route_entry") + + # 路由:有已知URL直接验证,否则走搜索 + graph.add_conditional_edges( + "route_entry", + route_entry, + { + "has_url": "visit_and_verify", + "need_search": "search_sogou" + } + ) + + # 流程 + graph.add_edge("search_sogou", "extract_links") + + graph.add_conditional_edges( + "extract_links", + check_links, + { + "no_links": END, + "has_links": "visit_and_verify" + } + ) + + graph.add_conditional_edges( + "visit_and_verify", + check_verify_result, + { + "found": END, + "failed": END, + "need_navigate": "navigate_to_recruitment" + } + ) + + graph.add_conditional_edges( + "navigate_to_recruitment", + check_navigate_result, + { + "no_more_candidates": END, + "verify": "visit_and_verify" + } + ) + + return graph.compile() diff --git a/src/search_company_graph/main.py b/src/search_company_graph/main.py new file mode 100644 index 00000000..037498fb --- /dev/null +++ b/src/search_company_graph/main.py @@ -0,0 +1,88 @@ +"""入口""" +import asyncio +import sys +from pathlib import Path + +# 支持直接运行 +if __name__ == "__main__": + sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +from 
playwright.async_api import async_playwright +from playwright_stealth import Stealth +from src.search_company_graph.graph import create_graph + + +async def search_company_recruitment(company_name: str, input_url: str = None, headless: bool = False) -> list[str]: + """ + 搜索公司招聘列表页面 + + Args: + company_name: 公司名称 + input_url: 已知招聘地址(可选),有值时跳过搜索直接验证 + headless: 是否无头模式 + + Returns: + list[str]: 招聘列表页 URL 列表,失败返回空列表 + """ + async with async_playwright() as p: + browser = await p.chromium.launch(headless=headless, channel="chrome") + context = await browser.new_context() + + # 应用 stealth + stealth = Stealth() + await stealth.apply_stealth_async(context) + + page = await context.new_page() + + try: + graph = create_graph() + + initial_state = { + "company_name": company_name, + "page": page + } + + # 已知URL时,直接作为候选链接跳过搜索 + if input_url: + initial_state["candidate_urls"] = [input_url] + initial_state["current_url_index"] = 0 + initial_state["clicked_selectors"] = [] + initial_state["navigate_retry_count"] = 0 + + print(f"\n{'='*50}") + print(f"开始搜索: {company_name}") + print(f"{'='*50}\n") + + final_state = await graph.ainvoke(initial_state) + + print(f"\n{'='*50}") + print("搜索完成") + print(f"{'='*50}\n") + + # 返回结果 + result_url = final_state.get("result_url") + if result_url: + return [result_url] + return [] + + finally: + await browser.close() + + +async def main(): + """测试""" + company_name = "五粮液" + + result = await search_company_recruitment(company_name) + + print("\n最终结果:") + if result: + print(f"✅ 成功找到招聘页面:") + for url in result: + print(f" {url}") + else: + print("❌ 未找到招聘页面") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/src/search_company_graph/nodes.py b/src/search_company_graph/nodes.py new file mode 100644 index 00000000..7eadf1b3 --- /dev/null +++ b/src/search_company_graph/nodes.py @@ -0,0 +1,457 @@ +"""节点实现""" +import asyncio +from pydantic import BaseModel, Field +from langchain_core.messages import HumanMessage +from src.bash_model 
import GeneralLlm +from .state import SearchState +from .prompts import EXTRACT_LINKS_PROMPT, VERIFY_RECRUITMENT_LIST_PROMPT, FIND_RECRUITMENT_ENTRY_PROMPT + + +# 结构化输出模型 +class ExtractLinksResult(BaseModel): + """提取链接结果""" + urls: list[str] = Field(description="候选URL列表,最多3个") + reason: str = Field(description="选择理由") + + +class VerifyResult(BaseModel): + """验证结果""" + is_recruitment_list: bool = Field(description="是否为招聘列表页") + reason: str = Field(description="判断依据") + + +class FindEntryResult(BaseModel): + """找入口结果""" + selector: str | None = Field(description="CSS选择器,找不到则为None") + reason: str = Field(description="选择或找不到的原因") + + +async def get_clean_html(page, max_retries: int = 3) -> str: + """获取清理后的页面 HTML(移除 style/script 等),带重试逻辑""" + for attempt in range(max_retries): + try: + # 等待页面加载稳定 + try: + await page.wait_for_load_state("domcontentloaded", timeout=5000) + except Exception: + pass + + # 额外等待一下,确保页面不再跳转 + await asyncio.sleep(1) + + html = await page.evaluate(""" + () => { + const clone = document.body.cloneNode(true); + clone.querySelectorAll('style, script, noscript, svg, link').forEach(el => el.remove()); + return clone.innerHTML; + } + """) + return html + except Exception as e: + if attempt < max_retries - 1: + # 可能是页面正在跳转,等待后重试 + await asyncio.sleep(2) + else: + # 最后一次尝试失败,返回空字符串 + print(f"[get_clean_html] 获取页面内容失败: {e}") + return "" + + +async def get_search_results(page) -> str: + """提取搜狗搜索结果(结构化文本),并解析搜狗跳转链接获取真实 URL""" + # 第一步:提取搜索结果基本信息 + raw_results = await page.evaluate(""" + () => { + // 获取自然搜索结果(排除广告和推荐框) + const results = document.querySelectorAll('.vrwrap:not(.middle-better-hintBox)'); + if (!results.length) return []; + + return Array.from(results).map((el, i) => { + const title = el.querySelector('.vr-title a, h3 a')?.innerText?.trim() || ''; + + // 优先从后续的 .r-sech 元素的 data-url 获取完整 URL + let url = ''; + const nextEl = el.nextElementSibling; + if (nextEl && nextEl.classList.contains('r-sech')) { + url = nextEl.getAttribute('data-url') || 
''; + } + + // 备选:从链接 href 获取 + if (!url) { + const linkEl = el.querySelector('.vr-title a, h3 a'); + url = linkEl?.href || ''; + } + + const desc = el.querySelector('.fz-mid, .star-wiki')?.innerText?.trim() || ''; + return { title, url, desc }; + }).filter(item => item.url && (item.url.startsWith('http://') || item.url.startsWith('https://'))); + } + """) + + if not raw_results: + # 回退到纯文本提取 + return await page.evaluate("() => document.body.innerText.slice(0, 30000)") + + # 第二步:解析搜狗跳转链接获取真实 URL + resolved_results = [] + for item in raw_results: + url = item.get('url', '') + + # 如果是搜狗跳转链接,尝试解析真实 URL + if 'sogou.com/link' in url: + try: + real_url = await resolve_sogou_redirect(page, url) + if real_url: + url = real_url + except Exception: + pass # 解析失败则保留原 URL + + resolved_results.append({ + 'title': item.get('title', ''), + 'url': url, + 'desc': item.get('desc', '') + }) + + # 第三步:格式化输出 + output_lines = [] + for i, item in enumerate(resolved_results): + output_lines.append(f"{i+1}. {item['title']}\nURL: {item['url']}\n{item['desc']}") + + return "\n\n".join(output_lines) + + +async def resolve_sogou_redirect(page, sogou_url: str) -> str: + """解析搜狗跳转链接,获取真实目标 URL""" + try: + # 使用 fetch 获取跳转页面内容 + response_text = await page.evaluate(""" + async (url) => { + try { + const resp = await fetch(url, { redirect: 'manual' }); + const text = await resp.text(); + return text; + } catch (e) { + return ''; + } + } + """, sogou_url) + + if not response_text: + return '' + + # 从 meta refresh 标签提取真实 URL + # 匹配模式: + import re + match = re.search(r"URL='([^']+)'", response_text, re.IGNORECASE) + if match: + return match.group(1) + + # 备选匹配模式: url=http://xxx.com + match = re.search(r'url=([^"\s>]+)', response_text, re.IGNORECASE) + if match: + return match.group(1) + + return '' + except Exception: + return '' + + +async def search_sogou(state: SearchState) -> dict: + """Node 1: 搜狗搜索""" + page = state["page"] + company_name = state["company_name"] + + print(f"[Node 1] 搜狗搜索: 
{company_name} 校园招聘 官网") + + # 访问搜狗 + await page.goto("https://www.sogou.com") + await asyncio.sleep(1) + + # 搜索 + await page.fill('input[name="query"]', f"{company_name} 校园招聘 官网") + await page.press('input[name="query"]', "Enter") + try: + await page.wait_for_load_state("networkidle", timeout=10000) + except Exception: + pass # 超时也继续 + await asyncio.sleep(2) + + # 获取搜索结果(结构化文本) + search_result_text = await get_search_results(page) + + print(f"[Node 1] 获取搜索结果完成,长度: {len(search_result_text)}") + + # 强制删除微公众号信息网页 + search_result_text = search_result_text.replace("mp.weixin.qq.com", "") + + return {"search_result_html": search_result_text} + + +async def extract_links(state: SearchState) -> dict: + """Node 2: 提取候选链接""" + search_text = state["search_result_html"] + company_name = state["company_name"] + + print("[Node 2] 分析搜索结果,提取候选链接...") + + # LLM 分析 + prompt = EXTRACT_LINKS_PROMPT.format(company_name=company_name, html=search_text) + llm = GeneralLlm.with_structured_output(ExtractLinksResult) + + try: + result = await llm.ainvoke([HumanMessage(content=prompt)]) + except Exception as e: + print(f"[Node 2] LLM 调用失败: {e}") + return { + "candidate_urls": [], + "current_url_index": 0, + "clicked_selectors": [], + "navigate_retry_count": 0 + } + + # 规范化 URL(确保有协议前缀) + normalized_urls = [] + for url in result.urls: + if url and not url.startswith(('http://', 'https://')): + url = 'https://' + url + if url: + normalized_urls.append(url) + + print(f"[Node 2] 提取到 {len(normalized_urls)} 个候选链接:") + for i, url in enumerate(normalized_urls): + print(f" {i+1}. 
{url}") + + return { + "candidate_urls": normalized_urls, + "current_url_index": 0, + "clicked_selectors": [], + "navigate_retry_count": 0 + } + + +async def visit_and_verify(state: SearchState) -> dict: + """Node 3: 访问并验证是否为招聘列表页""" + context = state["page"].context + candidate_urls = state.get("candidate_urls", []) + current_url_index = state.get("current_url_index", 0) + + # 确保有有效页面 + if not context.pages: + page = await context.new_page() + else: + # 只保留最新的页面 + page = context.pages[-1] + for p in context.pages[:-1]: + try: + await p.close() + except Exception: + pass + + # 检查是否还有候选链接 + if current_url_index >= len(candidate_urls): + print("[Node 3] 候选链接已用完") + return {"error": "所有候选链接均未找到招聘列表页"} + + current_url = candidate_urls[current_url_index] + clicked_selectors = state.get("clicked_selectors", []) + + # 判断是否从 Node 4 跳转回来(已经点击过入口) + if clicked_selectors: + # 从 Node 4 返回,直接使用当前页面 + print(f"[Node 3] 验证跳转后的页面: {page.url}") + page_html = await get_clean_html(page) + else: + # 首次访问该候选链接 + print(f"[Node 3] 访问候选链接 {current_url_index + 1}/{len(candidate_urls)}: {current_url}") + + # 重试机制 + max_retries = 3 + for attempt in range(max_retries): + try: + await page.goto(current_url, wait_until="domcontentloaded", timeout=20000) + # 等待页面稳定(处理可能的跳转) + await asyncio.sleep(2) + try: + await page.wait_for_load_state("networkidle", timeout=5000) + except Exception: + pass + await asyncio.sleep(1) + break + except Exception as e: + print(f"[Node 3] 访问失败 ({attempt + 1}/{max_retries}): {e}") + if attempt == max_retries - 1: + return { + "current_url": current_url, + "current_url_index": current_url_index + 1, + "clicked_selectors": [], + "navigate_retry_count": 0 + } + await asyncio.sleep(2) + + # 获取页面内容 + page_html = await get_clean_html(page) + + # 如果获取失败,尝试下一个候选 + if not page_html: + print("[Node 3] 获取页面内容失败,尝试下一个候选") + return { + "current_url": current_url, + "current_url_index": current_url_index + 1, + "clicked_selectors": [], + "navigate_retry_count": 0 + } + + # 截断 + 
display_html = page_html + # if len(display_html) > 50000: + # display_html = display_html[:50000] + "\n... (已截断)" + + # LLM 判断 + prompt = VERIFY_RECRUITMENT_LIST_PROMPT.format(html=display_html) + llm = GeneralLlm.with_structured_output(VerifyResult) + + try: + result = await llm.ainvoke([HumanMessage(content=prompt)]) + except Exception as e: + print(f"[Node 3] LLM 调用失败: {e}") + # LLM 失败,尝试下一个候选 + return { + "current_url": current_url, + "current_url_index": current_url_index + 1, + "clicked_selectors": [], + "navigate_retry_count": 0 + } + + print(f"[Node 3] 是否为招聘列表页: {result.is_recruitment_list}") + print(f"[Node 3] 原因: {result.reason}") + + if result.is_recruitment_list: + return { + "current_url": current_url, + "page_html": page_html, + "result_url": page.url # 使用实际 URL(可能有跳转) + } + + return { + "current_url": current_url, + "page_html": page_html + } + + +async def navigate_to_recruitment(state: SearchState) -> dict: + """Node 4: 找入口并跳转""" + context = state["page"].context + + # 确保有有效页面 + if not context.pages: + # 没有页面了,换下一个候选 + current_url_index = state.get("current_url_index", 0) + return { + "current_url_index": current_url_index + 1, + "clicked_selectors": [], + "navigate_retry_count": 0 + } + page = context.pages[-1] + + page_html = state.get("page_html", "") + clicked_selectors = state.get("clicked_selectors", []) + navigate_retry_count = state.get("navigate_retry_count", 0) + current_url_index = state.get("current_url_index", 0) + + print(f"[Node 4] 尝试找跳转入口 (第 {navigate_retry_count + 1}/10 次)") + + # 检查重试次数 + if navigate_retry_count >= 5: + print("[Node 4] 已尝试 5 次,换下一个候选链接") + return { + "current_url_index": current_url_index + 1, + "clicked_selectors": [], + "navigate_retry_count": 0 + } + + # 检查 page_html 是否有效 + if not page_html or len(page_html) < 100: + print("[Node 4] page_html 无效,重新获取") + page_html = await get_clean_html(page) + if not page_html or len(page_html) < 100: + print("[Node 4] 页面内容仍无效,换下一个候选") + return { + "current_url_index": 
# Number of entry-finding attempts allowed per candidate URL before Node 4
# gives up on it.  Used for both the check and the progress message so the
# two can no longer disagree.
_MAX_NAVIGATE_ATTEMPTS = 5

# Cleaned HTML shorter than this is treated as unusable and re-fetched.
_MIN_USABLE_HTML_LENGTH = 100


def _abandon_candidate(current_url_index: int) -> dict:
    """Build the state update that moves on to the next candidate URL."""
    return {
        "current_url_index": current_url_index + 1,
        "clicked_selectors": [],
        "navigate_retry_count": 0,
    }


def _count_attempt(page_html, clicked_selectors: list, navigate_retry_count: int) -> dict:
    """Build the state update that records one more attempt on this candidate."""
    return {
        "page_html": page_html,
        "clicked_selectors": clicked_selectors,
        "navigate_retry_count": navigate_retry_count + 1,
    }


async def navigate_to_recruitment(state: SearchState) -> dict:
    """Node 4: ask the LLM for an entry element on the page and click it.

    Args:
        state: Graph state.  Reads ``page``, ``page_html``,
            ``clicked_selectors``, ``navigate_retry_count`` and
            ``current_url_index``.

    Returns:
        A partial state update:

        * "next candidate" (index + 1, bookkeeping reset) when no browser
          page is open, the attempt limit is reached, or no usable HTML can
          be obtained;
        * otherwise ``page_html`` / ``clicked_selectors`` /
          ``navigate_retry_count`` with the counter incremented.  The
          selector is appended to ``clicked_selectors`` whenever a click was
          attempted, even if it failed.
    """
    context = state["page"].context
    current_url_index = state.get("current_url_index", 0)

    # Without an open page there is nothing to click: try the next candidate.
    if not context.pages:
        return _abandon_candidate(current_url_index)
    page = context.pages[-1]

    page_html = state.get("page_html", "")
    clicked_selectors = state.get("clicked_selectors", [])
    navigate_retry_count = state.get("navigate_retry_count", 0)

    # Check the limit first so no "attempt N+1" line is logged for an
    # attempt that is never made.
    if navigate_retry_count >= _MAX_NAVIGATE_ATTEMPTS:
        print(f"[Node 4] 已尝试 {_MAX_NAVIGATE_ATTEMPTS} 次,换下一个候选链接")
        return _abandon_candidate(current_url_index)

    print(f"[Node 4] 尝试找跳转入口 (第 {navigate_retry_count + 1}/{_MAX_NAVIGATE_ATTEMPTS} 次)")

    # Re-read the page when the HTML carried in the state is missing or tiny.
    if not page_html or len(page_html) < _MIN_USABLE_HTML_LENGTH:
        print("[Node 4] page_html 无效,重新获取")
        page_html = await get_clean_html(page)
        if not page_html or len(page_html) < _MIN_USABLE_HTML_LENGTH:
            print("[Node 4] 页面内容仍无效,换下一个候选")
            return _abandon_candidate(current_url_index)

    # The full cleaned HTML is sent to the LLM (no truncation is applied),
    # together with the selectors that were already tried.
    clicked_str = "\n".join(clicked_selectors) if clicked_selectors else "无"
    prompt = FIND_RECRUITMENT_ENTRY_PROMPT.format(
        html=page_html,
        clicked_selectors=clicked_str,
    )
    llm = GeneralLlm.with_structured_output(FindEntryResult)

    try:
        result = await llm.ainvoke([HumanMessage(content=prompt)])
    except Exception as e:
        print(f"[Node 4] LLM 调用失败: {e}")
        return _count_attempt(page_html, clicked_selectors, navigate_retry_count)

    # NOTE(review): structured output may be None when the model returned
    # nothing parsable (depends on GeneralLlm configuration); handle it like
    # "no entry found" instead of failing on attribute access.
    if result is None or not result.selector:
        reason = result.reason if result is not None else "LLM 未返回结构化结果"
        print(f"[Node 4] 未找到入口: {reason}")
        # A miss still counts as an attempt so the loop terminates.
        return _count_attempt(page_html, clicked_selectors, navigate_retry_count)

    print(f"[Node 4] 找到入口: {result.selector}")
    print(f"[Node 4] 原因: {result.reason}")

    attempted_selectors = clicked_selectors + [result.selector]

    # Remember the tab count so a click that opens a new tab can be detected.
    tabs_before = len(context.pages)

    try:
        element = page.locator(result.selector).first
        await element.scroll_into_view_if_needed()
        await asyncio.sleep(0.2)
        await element.hover()
        await asyncio.sleep(0.3)
        await element.click()
        await asyncio.sleep(3)
    except Exception as e:
        print(f"[Node 4] 点击失败: {e}")
        return _count_attempt(page_html, attempted_selectors, navigate_retry_count)

    # Follow the click into a newly opened tab, if there is one.
    if len(context.pages) > tabs_before:
        page = context.pages[-1]
        print(f"[Node 4] 打开新标签页: {page.url}")

    try:
        await page.wait_for_load_state("networkidle", timeout=10000)
    except Exception:
        # Best effort only: pages that keep polling never become idle.
        pass

    new_html = await get_clean_html(page)
    print(f"[Node 4] 点击后页面: {page.url}")

    return _count_attempt(new_html, attempted_selectors, navigate_retry_count)
class SearchState(TypedDict, total=False):
    """State passed between the nodes that search for a company's recruitment page.

    ``total=False`` makes every key optional; the node functions read the
    bookkeeping keys with ``state.get(...)`` and a default.
    """

    # Inputs
    company_name: str
    page: Page

    # Node 1: search_sogou
    search_result_html: str

    # Node 2: extract_links
    candidate_urls: list[str]  # candidate links (the extraction prompt asks for at most 3)
    current_url_index: int  # index of the candidate currently being tried

    # Node 3: visit_and_verify
    current_url: str  # URL of the candidate currently being visited
    page_html: str  # HTML of the page currently being examined

    # Node 4: navigate_to_recruitment
    clicked_selectors: list[str]  # selectors that were already clicked
    # In-page navigation attempt counter.  The original note said the limit
    # is 10, but navigate_to_recruitment moves to the next candidate once
    # this reaches 5.
    navigate_retry_count: int
    # Whether the page changed after a click.
    # NOTE(review): not written by Node 3 / Node 4 as shown in this patch —
    # confirm whether it is still used.
    page_changed: bool

    # Results
    result_url: str  # URL of the recruitment list page that was finally found
    error: str  # error message
# Samsung recruitment site test configuration
test_config: CrawlerConfig = {
    "url": "https://dearsamsung.zhiye.com/#/samsung/pc/szzw",
    "job_item_selector": ".BHGkB li",
    "item_change_type": "in_page",
    "next_page_selector": "._8x6MD .ant-pagination-next:not([aria-disabled='true']) .ant-pagination-item-link",
    "page_change_type": "content_change",
    "field_selectors": {
        "job_title": {"selector": ["h2"]},
        "description": {"selector": ['.aCl-8 p', '.aCl-8 pre']},
    },
    "detail_area_selector": ".FLf6j",
}


async def main():
    """Crawl with ``test_config`` in a visible browser and print every record."""
    jobs = await crawl(test_config, headless=False)
    print(f"\n爬取完成,共 {len(jobs)} 条数据")

    # Records are numbered from 1 in the output.
    for position, job in enumerate(jobs, start=1):
        print(f"\n--- 岗位 {position} ---")
        for field, value in job.items():
            print(f"{field}: {value}")


if __name__ == "__main__":
    asyncio.run(main())