import logging
import os
import re
import urllib.parse
from datetime import datetime

import feedparser
import jieba
import jieba.analyse
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from flask import Flask, jsonify, request
from flask_cors import CORS
from langchain.agents import AgentExecutor, create_tool_calling_agent, tool
from langchain.chat_models import init_chat_model
from langchain_core.prompts import ChatPromptTemplate
from langchain_tavily import TavilySearch
load_dotenv(override=True)
logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__)
app = Flask(__name__)
CORS(app, origins=[ "http://smallgoodgood.top", "https://smallgoodgood.top", "http://www.smallgoodgood.top", "https://www.smallgoodgood.top", "http://localhost:*" ])
search = TavilySearch(max_results=3)
jieba.initialize()
def fetch_blog_content(url): try: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' } response = requests.get(url, headers=headers, timeout=10) response.raise_for_status() soup = BeautifulSoup(response.content, 'html.parser') content_selectors = [ 'article', '.post-content', '.entry-content', '.blog-content', 'div[class*="content"]', 'main' ] content = None for selector in content_selectors: content_element = soup.select_one(selector) if content_element: content = content_element.get_text(strip=False) break if not content: body = soup.find('body') if body: for element in body(['nav', 'footer', 'header', 'aside', 'script', 'style']): element.decompose() content = body.get_text(strip=False) if content: content = re.sub(r'\n\s*\n', '\n\n', content) content = re.sub(r'^\s+', '', content, flags=re.MULTILINE) return content[:10000] return "无法提取文章内容" except Exception as e: return f"抓取文章内容时出错: {str(e)}"
@tool def search_blog_rss(query: str): try: feed = feedparser.parse("https://smallgoodgood.top/rss.xml") chinese_keywords = jieba.analyse.extract_tags(query, topK=5, withWeight=False) english_words = set(re.findall(r'[a-zA-Z0-9]{3,}', query.lower())) query_terms = set(chinese_keywords) | english_words query_lower = query.lower() results = [] for entry in feed.entries: title = entry.title description = entry.get('description', '') if description: description = re.sub(r'<[^>]+>', '', description) search_text = f"{title} {description}".lower() title_lower = title.lower() score = 0 if any(term in title_lower for term in query_terms if len(term) > 1): score += 20 elif any(term in search_text for term in query_terms if len(term) > 1): score += 10 if query_lower in search_text: score += 15 content_keywords = set(jieba.analyse.extract_tags( f"{title} {description}", topK=10, withWeight=False )) keyword_match = len(query_terms & content_keywords) score += keyword_match * 3 if score > 0: published = entry.get('published', entry.get('pubDate', '未知日期')) full_content = fetch_blog_content(entry.link) results.append({ 'title': title, 'link': entry.link, 'published': published, 'summary': description[:200] + '...' if description else "暂无摘要", 'full_content': full_content, 'score': score }) results.sort(key=lambda x: x['score'], reverse=True) if results: response = f"在博客中找到 {len(results)} 篇相关文章:\n\n" for i, art in enumerate(results[:3], 1): response += f"{i}. **{art['title']}** (相关度: {art['score']})\n" response += f" 发布时间: {art['published']}\n" response += f" 摘要: {art['summary']}\n" response += f" 链接: {art['link']}\n\n" if len(art['full_content']) > 500: response += f" 内容预览: {art['full_content'][:500]}...\n\n" else: response += f" 内容: {art['full_content']}\n\n" return response else: return "在博客中没有找到相关文章。您可以尝试使用其他关键词搜索。" except Exception as e: return f"读取博客RSS时出错: {str(e)}"
@tool def get_blog_article(url: str): try: content = fetch_blog_content(url) return f"文章内容:\n\n{content}" except Exception as e: return f"获取文章内容时出错: {str(e)}"
tools = [search_blog_rss, get_blog_article]
prompt = ChatPromptTemplate.from_messages([ ("system", """你是一名助人为乐的助手,可以搜索smallgoodgood.top博客的内容来获取信息。
重要指令: 1. 当用户询问技术问题时,先使用search_blog_rss搜索博客中是否有相关文章 2. 找到相关文章后,使用get_blog_article获取文章的完整内容 3. 基于文章的完整内容进行详细回答,不要只依赖摘要 4. 引用具体的技术细节和方法 5. 如果文章内容不够详细,可以补充一般性的技术知识
请确保基于真实内容提供准确的回答。"""), ("human", "{input}"), ("placeholder", "{agent_scratchpad}"), ])
model = init_chat_model("deepseek-chat", model_provider="deepseek")
agent = create_tool_calling_agent(model, tools, prompt) agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
@app.route('/health', methods=['GET']) def health_check(): return jsonify({ "status": "healthy", "service": "Chat Zenith Langchain Service", "timestamp": datetime.now().isoformat() })
@app.route('/chat', methods=['POST', 'OPTIONS']) def chat(): if request.method == 'OPTIONS': return '', 204 try: data = request.get_json() if not data or 'messages' not in data: return jsonify({ "error": "请求格式错误,需要包含messages字段" }), 400 messages = data['messages'] user_message = None for msg in reversed(messages): if msg.get('role') == 'user': user_message = msg.get('content', '') break if not user_message: return jsonify({ "error": "没有找到用户消息" }), 400 logger.info(f"收到用户消息: {user_message}") result = agent_executor.invoke({"input": user_message}) response_data = { "choices": [{ "message": { "content": result['output'], "role": "assistant" } }], "model": "langchain-deepseek" } logger.info(f"生成回复: {result['output'][:100]}...") return jsonify(response_data) except Exception as e: logger.error(f"处理请求时出错: {str(e)}", exc_info=True) return jsonify({ "error": f"服务器内部错误: {str(e)}" }), 500
@app.errorhandler(404) def not_found(error): return jsonify({ "error": "端点不存在" }), 404
@app.errorhandler(500) def internal_error(error): logger.error(f"内部服务器错误: {str(error)}") return jsonify({ "error": "内部服务器错误" }), 500
if __name__ == '__main__': logger.info("启动Chat Zenith Web服务...") app.run( host='0.0.0.0', port=5000, debug=False )