1- """Validate Gherkin tool component — syntax parsing and lint checks."""
1+ """Validate Gherkin tool component — syntax parsing and structural lint checks.
2+
3+ Tier 1: Syntax validation via gherkin-official parser.
4+ Tier 2: Structural lint checks via pure Python against the parsed AST.
5+ No external lint tools or subprocess calls required.
6+ """
27
38from __future__ import annotations
49
5- import base64
610import json
711import logging
8- import shlex
9- import uuid
1012
1113from langchain_core .tools import tool
1214
1921def validate_gherkin_factory (node ):
2022 """Return a LangChain tool that validates Gherkin .feature specs."""
2123
22- # Resolve parent workspace and build sandbox backend (same pattern as run_command)
23- from components .run_command import _resolve_parent_workspace
24- from components ._agent_shared import _build_backend
25-
26- parent_extra = _resolve_parent_workspace (node )
27- backend = None
28- if parent_extra .get ("workspace_id" ):
29- try :
30- backend = _build_backend (parent_extra )
31- logger .info (
32- "validate_gherkin %s: using sandbox backend (workspace_id=%s)" ,
33- node .node_id ,
34- parent_extra ["workspace_id" ],
35- )
36- except Exception :
37- logger .warning (
38- "validate_gherkin %s: failed to build sandbox backend, lint checks will be skipped" ,
39- node .node_id ,
40- exc_info = True ,
41- )
42-
4324 @tool
4425 def validate_gherkin (gherkin_spec : str ) -> str :
4526 """Validate a Gherkin feature spec for syntax errors and lint warnings.
@@ -71,15 +52,13 @@ def validate_gherkin(gherkin_spec: str) -> str:
7152 from gherkin .parser import Parser
7253
7354 parser = Parser ()
74- parser .parse (gherkin_spec )
55+ doc = parser .parse (gherkin_spec )
7556 except Exception as e :
7657 result ["valid" ] = False
7758 error_info = {"message" : str (e ), "line" : 0 }
78- # Try to extract line number from the error message
7959 err_str = str (e )
8060 if "(" in err_str and ":" in err_str :
8161 try :
82- # gherkin-official errors often contain "(line:col)" patterns
8362 parts = err_str .split ("(" )
8463 for part in parts :
8564 if ":" in part and ")" in part :
@@ -92,106 +71,134 @@ def validate_gherkin(gherkin_spec: str) -> str:
9271 result ["parse_errors" ].append (error_info )
9372 return json .dumps (result )
9473
95- # ── Tier 2: Lint via gherlint CLI (sandboxed) ────────────────────
96- if backend is not None :
97- try :
98- # Encode content as base64 so arbitrary Gherkin can be safely
99- # embedded in a shell command without quoting issues.
100- encoded = base64 .b64encode (gherkin_spec .encode ()).decode ()
101- temp_filename = f"/tmp/_validate_gherkin_{ uuid .uuid4 ().hex } .feature"
102- cmd = (
103- f"echo { shlex .quote (encoded )} | base64 -d > { shlex .quote (temp_filename )} "
104- f" && gherlint lint { shlex .quote (temp_filename )} "
105- f"; STATUS=$?; rm -f { shlex .quote (temp_filename )} ; exit $STATUS"
106- )
107- resp = backend .execute (cmd , timeout = 30 )
108- _parse_gherlint_output (resp .output or "" , "" , resp .exit_code or 0 , result )
109- except Exception :
110- logger .warning ("gherlint lint failed" , exc_info = True )
111- else :
112- logger .debug ("validate_gherkin: no sandbox backend, skipping lint checks" )
74+ # ── Tier 2: Structural lint checks against parsed AST ────────────
75+ _lint_ast (doc , result )
11376
11477 return json .dumps (result )
11578
11679 return validate_gherkin
11780
11881
119- def _parse_gherlint_output (
120- stdout : str , stderr : str , returncode : int , result : dict
121- ) -> None :
122- """Parse gherlint CLI output and populate result dict."""
123- output = (stdout + "\n " + stderr ).strip ()
124- if not output :
82+ def _lint_ast (doc : dict , result : dict ) -> None :
83+ """Run structural lint checks against a parsed Gherkin AST."""
84+ feature = doc .get ("feature" )
85+
86+ if not feature :
87+ result ["lint_errors" ].append ({
88+ "code" : "E001" ,
89+ "message" : "No Feature block found" ,
90+ "line" : 0 ,
91+ })
92+ result ["valid" ] = False
12593 return
12694
127- for line in output .splitlines ():
128- line = line .strip ()
129- if not line :
130- continue
95+ # Check feature has a name
96+ if not feature .get ("name" , "" ).strip ():
97+ result ["lint_warnings" ].append ({
98+ "code" : "W001" ,
99+ "message" : "Feature has no name" ,
100+ "line" : feature .get ("location" , {}).get ("line" , 0 ),
101+ })
102+
103+ children = feature .get ("children" , [])
104+ scenarios = [c for c in children if "scenario" in c ]
105+ backgrounds = [c for c in children if "background" in c ]
106+
107+ # Check feature has scenarios
108+ if not scenarios :
109+ result ["lint_warnings" ].append ({
110+ "code" : "W002" ,
111+ "message" : "Feature has no scenarios" ,
112+ "line" : feature .get ("location" , {}).get ("line" , 0 ),
113+ })
114+ return
115+
116+ # Check for duplicate scenario names
117+ seen_names : dict [str , int ] = {}
118+ for child in scenarios :
119+ sc = child ["scenario" ]
120+ name = sc .get ("name" , "" ).strip ()
121+ line = sc .get ("location" , {}).get ("line" , 0 )
122+ if name :
123+ if name in seen_names :
124+ result ["lint_warnings" ].append ({
125+ "code" : "W003" ,
126+ "message" : f"Duplicate scenario name: '{ name } ' (first at line { seen_names [name ]} )" ,
127+ "line" : line ,
128+ })
129+ else :
130+ seen_names [name ] = line
131+
132+ # Check each scenario
133+ for child in scenarios :
134+ sc = child ["scenario" ]
135+ name = sc .get ("name" , "" ).strip ()
136+ line = sc .get ("location" , {}).get ("line" , 0 )
137+ steps = sc .get ("steps" , [])
138+
139+ # Unnamed scenario
140+ if not name :
141+ result ["lint_warnings" ].append ({
142+ "code" : "W004" ,
143+ "message" : "Scenario has no name" ,
144+ "line" : line ,
145+ })
131146
132- # gherlint output format is typically:
133- # filename:line:col: CODE message
134- # or just warning/error messages
135- entry = _parse_lint_line (line )
136- if entry is None :
147+ # Empty scenario
148+ if not steps :
149+ result ["lint_warnings" ].append ({
150+ "code" : "W005" ,
151+ "message" : f"Scenario '{ name or '(unnamed)' } ' has no steps" ,
152+ "line" : line ,
153+ })
137154 continue
138155
139- code = entry .get ("code" , "" )
140- # Convention: Cxxx = convention, Wxxx = warning, Exxx = error
141- if code .startswith ("E" ):
142- result ["lint_errors" ].append (entry )
143- result ["valid" ] = False
144- else :
145- # W, C, and other codes are treated as warnings
146- result ["lint_warnings" ].append (entry )
147-
148-
149- def _parse_lint_line (line : str ) -> dict | None :
150- """Parse a single gherlint output line into a structured dict.
151-
152- Expected formats:
153- filename.feature:10:1: C0101 Step should start with a capital letter
154- filename.feature:5: W0301 Scenario has no Given step
155- C0101: Step should start with a capital letter (line 10)
156- """
157- # Format: path:line:col: CODE message
158- parts = line .split (":" , maxsplit = 3 )
159- if len (parts ) >= 4 :
160- try :
161- line_no = int (parts [1 ].strip ())
162- remainder = parts [3 ].strip ()
163- code , _ , message = remainder .partition (" " )
164- if code and code [0 ].isalpha () and any (c .isdigit () for c in code ):
165- return {"code" : code , "message" : message .strip (), "line" : line_no }
166- except (ValueError , IndexError ):
167- pass
168-
169- # Format: path:line: CODE message (no column)
170- if len (parts ) >= 3 :
171- try :
172- line_no = int (parts [1 ].strip ())
173- remainder = parts [2 ].strip ()
174- code , _ , message = remainder .partition (" " )
175- if code and code [0 ].isalpha () and any (c .isdigit () for c in code ):
176- return {"code" : code , "message" : message .strip (), "line" : line_no }
177- except (ValueError , IndexError ):
178- pass
179-
180- # Format: CODE: message (line N) or CODE message
181- if line and line [0 ].isalpha ():
182- code_part = line .split ()[0 ].rstrip (":" )
183- if any (c .isdigit () for c in code_part ):
184- message = line [len (code_part ):].strip ().lstrip (": " )
185- line_no = 0
186- # Try to extract (line N) from message
187- if "(line" in message :
188- try :
189- idx = message .index ("(line" )
190- num = message [idx + 5 :].split (")" )[0 ].strip ()
191- line_no = int (num )
192- message = message [:idx ].strip ()
193- except (ValueError , IndexError ):
194- pass
195- return {"code" : code_part , "message" : message , "line" : line_no }
156+ # Extract keyword types (Given, When, Then, And, But, *)
157+ keywords = [s .get ("keyword" , "" ).strip () for s in steps ]
158+
159+ # Check for missing Given
160+ if "Given" not in keywords :
161+ result ["lint_warnings" ].append ({
162+ "code" : "C001" ,
163+ "message" : f"Scenario '{ name or '(unnamed)' } ' has no Given step" ,
164+ "line" : line ,
165+ })
166+
167+ # Check for missing When
168+ if "When" not in keywords :
169+ result ["lint_warnings" ].append ({
170+ "code" : "C002" ,
171+ "message" : f"Scenario '{ name or '(unnamed)' } ' has no When step" ,
172+ "line" : line ,
173+ })
174+
175+ # Check for missing Then
176+ if "Then" not in keywords :
177+ result ["lint_warnings" ].append ({
178+ "code" : "C003" ,
179+ "message" : f"Scenario '{ name or '(unnamed)' } ' has no Then step" ,
180+ "line" : line ,
181+ })
182+
183+ # Check backgrounds
184+ for child in backgrounds :
185+ bg = child ["background" ]
186+ bg_steps = bg .get ("steps" , [])
187+ bg_line = bg .get ("location" , {}).get ("line" , 0 )
188+
189+ if not bg_steps :
190+ result ["lint_warnings" ].append ({
191+ "code" : "W006" ,
192+ "message" : "Background has no steps" ,
193+ "line" : bg_line ,
194+ })
196195
197- return None
196+ # Background should only contain Given steps
197+ for step in bg_steps :
198+ kw = step .get ("keyword" , "" ).strip ()
199+ if kw not in ("Given" , "And" , "But" , "*" ):
200+ result ["lint_warnings" ].append ({
201+ "code" : "C004" ,
202+ "message" : f"Background contains non-Given step: '{ kw } '" ,
203+ "line" : step .get ("location" , {}).get ("line" , 0 ),
204+ })
0 commit comments