|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# Licensed to the Apache Software Foundation (ASF) under one |
| 3 | +# or more contributor license agreements. See the NOTICE file |
| 4 | +# distributed with this work for additional information |
| 5 | +# regarding copyright ownership. The ASF licenses this file |
| 6 | +# to you under the Apache License, Version 2.0 (the |
| 7 | +# "License"); you may not use this file except in compliance |
| 8 | +# with the License. You may obtain a copy of the License at |
| 9 | +# |
| 10 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | +# |
| 12 | +# Unless required by applicable law or agreed to in writing, software |
| 13 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 14 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 15 | +# See the License for the specific language governing permissions and |
| 16 | +# limitations under the License. |
| 17 | +""" |
| 18 | +Natural Language to SQL agent for Apache Hive Beeline. |
| 19 | +
|
| 20 | +Spawns the Metastore MCP Server as a subprocess, connects as an MCP client, |
| 21 | +and uses LangChain + Claude to discover schema and generate HiveQL. |
| 22 | +Prints only the generated SQL to stdout. |
| 23 | +""" |
| 24 | + |
| 25 | +import argparse |
| 26 | +import asyncio |
| 27 | +import os |
| 28 | +import sys |
| 29 | +import re |
| 30 | + |
| 31 | +from mcp import ClientSession, StdioServerParameters |
| 32 | +from mcp.client.stdio import stdio_client |
| 33 | +from langchain_anthropic import ChatAnthropic |
| 34 | +from langchain_core.messages import HumanMessage, SystemMessage |
| 35 | + |
| 36 | + |
async def get_schema_via_mcp(mcp_server_script, database):
    """Launch the Metastore MCP server as a subprocess and fetch the schema.

    Connects to the server over stdio, invokes its ``get_table_schema_sql``
    tool for *database*, and returns the concatenated text content of the
    tool result. The REST catalog URL is forwarded to the child process via
    the METASTORE_REST_URL environment variable.
    """
    rest_url = os.environ.get('METASTORE_REST_URL', 'http://localhost:9001/iceberg')

    params = StdioServerParameters(
        command='python3',
        args=[mcp_server_script],
        # Inherit the parent environment, pinning the catalog URL.
        env=dict(os.environ, METASTORE_REST_URL=rest_url),
    )

    async with stdio_client(params) as (reader, writer):
        async with ClientSession(reader, writer) as session:
            await session.initialize()

            result = await session.call_tool(
                'get_table_schema_sql',
                arguments={'database': database},
            )

            # Join every text-bearing content part of the tool result.
            pieces = [part.text for part in result.content
                      if hasattr(part, 'text')]
            return ''.join(pieces)
| 63 | + |
| 64 | + |
def generate_sql(schema_info, nl_query, database):
    """Translate a natural-language request into a single HiveQL statement.

    Builds a system prompt that embeds *schema_info* and *database*, sends
    *nl_query* to Claude through LangChain, and returns the model's answer
    with any surrounding markdown code fences removed.

    Endpoint, credentials, and model name come from the environment
    (ANTHROPIC_BASE_URL, ANTHROPIC_AUTH_TOKEN / ANTHROPIC_API_KEY,
    ANTHROPIC_MODEL), with sensible defaults.
    """
    # AUTH_TOKEN takes precedence; API_KEY is the fallback credential.
    api_key = os.environ.get('ANTHROPIC_AUTH_TOKEN',
                             os.environ.get('ANTHROPIC_API_KEY', ''))

    llm = ChatAnthropic(
        model=os.environ.get('ANTHROPIC_MODEL', 'claude-sonnet-4-20250514'),
        anthropic_api_url=os.environ.get('ANTHROPIC_BASE_URL',
                                         'https://api.anthropic.com'),
        anthropic_api_key=api_key,
        max_tokens=1024,
        temperature=0,  # deterministic output for SQL generation
    )

    system_prompt = f"""You are a HiveQL expert. Convert the user's natural language request into a valid HiveQL query.

RULES:
- Output ONLY the SQL query, nothing else. No markdown, no explanation, no code fences.
- Use HiveQL syntax (not MySQL or PostgreSQL).
- The current database is `{database}`.
- Use ONLY the tables and columns listed in the schema below. Do NOT reference tables that don't exist.
- If the request cannot be answered with the available schema, write the closest possible query using the actual tables.
- Always include a LIMIT clause for SELECT queries unless the user explicitly asks for all rows.

SCHEMA:
{schema_info}"""

    response = llm.invoke([
        SystemMessage(content=system_prompt),
        HumanMessage(content=nl_query),
    ])

    # Defensively strip markdown fences in case the model ignores the rules.
    text = response.content.strip()
    text = re.sub(r'^```(?:sql)?\s*', '', text)
    text = re.sub(r'\s*```$', '', text)
    return text.strip()
| 104 | + |
| 105 | + |
async def async_main(args):
    """Resolve the MCP server script, discover schema, and emit generated SQL.

    Schema-discovery problems degrade gracefully (warning on stderr plus a
    placeholder schema); SQL-generation failures terminate the process with
    exit code 1. On success, only the SQL statement is written to stdout so
    callers can consume it verbatim.
    """
    # Search order: $HIVE_HOME install layout first, then two locations
    # relative to this script within the source tree.
    hive_home = os.environ.get('HIVE_HOME', '')
    here = os.path.dirname(os.path.abspath(__file__))
    search_paths = (
        os.path.join(hive_home, 'scripts', 'metastore', 'mcp-server',
                     'metastore_mcp_server.py'),
        os.path.join(here, '..', 'metastore', 'mcp-server',
                     'metastore_mcp_server.py'),
        os.path.join(here, '..', '..', '..', 'standalone-metastore',
                     'metastore-tools', 'mcp-server',
                     'metastore_mcp_server.py'),
    )

    server_script = None
    for path in search_paths:
        path = os.path.normpath(path)
        if os.path.exists(path):
            server_script = path
            break

    if server_script is None:
        print('Warning: MCP server not found in any known location',
              file=sys.stderr)
        schema_info = '(Schema not available - MCP server not found)'
    else:
        try:
            schema_info = await get_schema_via_mcp(server_script, args.database)
        except Exception as e:
            print(f'Warning: MCP schema discovery failed: {e}', file=sys.stderr)
            schema_info = '(Schema not available)'

    try:
        sql = generate_sql(schema_info, args.query, args.database)
    except Exception as e:
        print(f'Error generating SQL: {e}', file=sys.stderr)
        sys.exit(1)

    # Stdout carries ONLY the SQL; all diagnostics above went to stderr.
    print(sql)
| 146 | + |
| 147 | + |
def main():
    """Parse CLI arguments, export the catalog URL, and run the async agent."""
    parser = argparse.ArgumentParser(description='Natural Language to HiveQL')
    parser.add_argument('--query', required=True,
                        help='Natural language query')
    parser.add_argument('--database', default='default',
                        help='Current database name')
    default_url = os.environ.get('METASTORE_REST_URL',
                                 'http://localhost:9001/iceberg')
    parser.add_argument('--mcp-url', default=default_url,
                        help='Metastore REST Catalog URL (including path prefix)')
    parsed = parser.parse_args()

    # Publish the (trailing-slash-free) URL so the spawned MCP server sees it.
    os.environ['METASTORE_REST_URL'] = parsed.mcp_url.rstrip('/')

    asyncio.run(async_main(parsed))
| 164 | + |
| 165 | + |
| 166 | +if __name__ == '__main__': |
| 167 | + main() |
0 commit comments