diff --git a/src/indexer/graphrag/database.rs b/src/indexer/graphrag/database.rs index 4340fb0..111ec73 100644 --- a/src/indexer/graphrag/database.rs +++ b/src/indexer/graphrag/database.rs @@ -192,23 +192,38 @@ impl<'a> DatabaseOperations<'a> { let desc_array = extract_string_column(&rel_batch, "description")?; let conf_array = extract_f32_column(&rel_batch, "confidence")?; - // Process each relationship + // Deduplicate relationships by (source, target, type) triple — + // batch writes can produce duplicates across incremental flushes + let mut seen = std::collections::HashSet::new(); + let mut dedup_count = 0usize; for i in 0..rel_batch.num_rows() { + let source = source_array.value(i); + let target = target_array.value(i); + let rel_type = type_array.value(i); + let key = (source.to_string(), target.to_string(), rel_type.to_string()); + if !seen.insert(key) { + dedup_count += 1; + continue; + } let relationship = CodeRelationship { - source: source_array.value(i).to_string(), - target: target_array.value(i).to_string(), - relation_type: type_array - .value(i) + source: source.to_string(), + target: target.to_string(), + relation_type: rel_type .parse() .unwrap_or(crate::indexer::graphrag::types::RelationType::Imports), description: desc_array.value(i).to_string(), confidence: conf_array.value(i), - weight: 1.0, // Default weight for legacy relationships + weight: 1.0, }; - - // Add to graph graph.relationships.push(relationship); } + if dedup_count > 0 && !quiet { + println!( + " Deduplicated {} → {} unique relationships", + rel_batch.num_rows(), + graph.relationships.len() + ); + } } if !graph.nodes.is_empty() && !quiet { diff --git a/src/indexer/graphrag/relationships.rs b/src/indexer/graphrag/relationships.rs index 3bad307..5743513 100644 --- a/src/indexer/graphrag/relationships.rs +++ b/src/indexer/graphrag/relationships.rs @@ -32,30 +32,36 @@ impl RelationshipDiscovery { for source_file in new_files { // 1. Import/Export relationships (high confidence) - for import in &source_file.imports { - for target_file in all_nodes { - if target_file.id == source_file.id { - continue; - } + // For markdown files, use proper path-based import resolution + // (the symbol-matching approach doesn't work for file path imports) + if source_file.language == "markdown" { + Self::discover_import_relationships(source_file, all_nodes, &mut relationships); + } else { + for import in &source_file.imports { + for target_file in all_nodes { + if target_file.id == source_file.id { + continue; + } - // Check if target exports what source imports - if target_file - .exports - .iter() - .any(|exp| symbols_match(import, exp)) - || target_file - .symbols + // Check if target exports what source imports + if target_file + .exports .iter() - .any(|sym| symbols_match(import, sym)) - { - relationships.push(CodeRelationship { - source: source_file.id.clone(), - target: target_file.id.clone(), - relation_type: crate::indexer::graphrag::types::RelationType::Imports, - description: format!("Imports {} from {}", import, target_file.name), - confidence: 0.9, - weight: 1.0, - }); + .any(|exp| symbols_match(import, exp)) + || target_file + .symbols + .iter() + .any(|sym| symbols_match(import, sym)) + { + relationships.push(CodeRelationship { + source: source_file.id.clone(), + target: target_file.id.clone(), + relation_type: crate::indexer::graphrag::types::RelationType::Imports, + description: format!("Imports {} from {}", import, target_file.name), + confidence: 0.9, + weight: 1.0, + }); + } } } } @@ -193,16 +199,26 @@ impl RelationshipDiscovery { { // Find the target node if let Some(target_node) = file_map.get(&resolved_path) { - // Create semantic import relationship + // Use References for markdown cross-links, Imports for code + let rel_type = if source_file.language == "markdown" { + crate::indexer::graphrag::types::RelationType::References + } else { + crate::indexer::graphrag::types::RelationType::Imports + }; + let description_prefix = if source_file.language == "markdown" { + "References" + } else { + "Direct import" + }; relationships.push(CodeRelationship { source: source_file.id.clone(), target: target_node.id.clone(), - relation_type: crate::indexer::graphrag::types::RelationType::Imports, + relation_type: rel_type, description: format!( - "Direct import: {} -> {}", - import_path, resolved_path + "{}: {} -> {}", + description_prefix, import_path, resolved_path ), - confidence: 0.95, // High confidence for resolved imports + confidence: 0.95, weight: 1.0, }); @@ -482,10 +498,9 @@ impl RelationshipDiscovery { || relative_path.contains(".test.") { "test_file".to_string() - } else if relative_path.ends_with(".md") - || relative_path.ends_with(".txt") - || relative_path.ends_with(".rst") - { + } else if relative_path.ends_with(".md") || relative_path.ends_with(".markdown") { + "document_file".to_string() + } else if relative_path.ends_with(".txt") || relative_path.ends_with(".rst") { "documentation".to_string() } else if relative_path.contains("/config") || relative_path.contains(".config") { "config_file".to_string() diff --git a/src/indexer/graphrag/tests.rs b/src/indexer/graphrag/tests.rs index 891e022..182668b 100644 --- a/src/indexer/graphrag/tests.rs +++ b/src/indexer/graphrag/tests.rs @@ -954,4 +954,101 @@ def _private_function(): extract_imports_exports_recursive(child, contents, lang_impl, all_imports, all_exports); } } + + /// Integration test: verify discover_relationships_efficiently produces + /// References edges for markdown nodes with file-path imports. + /// This is the exact code path used at runtime (not discover_import_relationships directly). + #[tokio::test] + async fn test_markdown_references_via_efficient_discovery() { + use crate::indexer::graphrag::types::RelationType; + + // Source: adapters-integrations.md imports credit-suite.md and credit-accounts.md + let source = CodeNode { + id: "projects/docs/core/adapters-integrations.md".to_string(), + name: "adapters-integrations".to_string(), + kind: "document_file".to_string(), + path: "projects/docs/core/adapters-integrations.md".to_string(), + description: String::new(), + symbols: vec![], + imports: vec![ + "credit-suite.md".to_string(), // same-dir + "../intro/credit-accounts.md".to_string(), // parent-dir + ], + exports: vec!["Adapters".to_string()], + functions: vec![], + hash: "aaa".to_string(), + embedding: vec![], + size_lines: 50, + language: "markdown".to_string(), + }; + + let target1 = CodeNode { + id: "projects/docs/core/credit-suite.md".to_string(), + name: "credit-suite".to_string(), + kind: "document_file".to_string(), + path: "projects/docs/core/credit-suite.md".to_string(), + description: String::new(), + symbols: vec![], + imports: vec![], + exports: vec!["Credit Suite".to_string()], + functions: vec![], + hash: "bbb".to_string(), + embedding: vec![], + size_lines: 100, + language: "markdown".to_string(), + }; + + let target2 = CodeNode { + id: "projects/docs/intro/credit-accounts.md".to_string(), + name: "credit-accounts".to_string(), + kind: "document_file".to_string(), + path: "projects/docs/intro/credit-accounts.md".to_string(), + description: String::new(), + symbols: vec![], + imports: vec![], + exports: vec!["Credit Accounts".to_string()], + functions: vec![], + hash: "ccc".to_string(), + embedding: vec![], + size_lines: 80, + language: "markdown".to_string(), + }; + + let all_nodes = vec![source.clone(), target1.clone(), target2.clone()]; + let new_files = vec![source.clone()]; + + // Call the SAME function used at runtime + let relationships = RelationshipDiscovery::discover_relationships_efficiently( + &new_files, + &all_nodes, + ) + .await + .expect("relationship discovery should succeed"); + + // Find References relationships (not just sibling_module) + let refs: Vec<_> = relationships + .iter() + .filter(|r| r.relation_type == RelationType::References) + .collect(); + + assert!( + refs.len() >= 2, + "Expected at least 2 References relationships, got {}: {:?}", + refs.len(), + refs.iter().map(|r| format!("{} -> {}", r.source, r.target)).collect::>() + ); + + // Verify specific edges + let has_suite = refs.iter().any(|r| { + r.source == "projects/docs/core/adapters-integrations.md" + && r.target == "projects/docs/core/credit-suite.md" + }); + assert!(has_suite, "Should have reference to credit-suite.md"); + + let has_accounts = refs.iter().any(|r| { + r.source == "projects/docs/core/adapters-integrations.md" + && r.target == "projects/docs/intro/credit-accounts.md" + }); + assert!(has_accounts, "Should have reference to credit-accounts.md"); + } } diff --git a/src/indexer/graphrag/types.rs b/src/indexer/graphrag/types.rs index 9fd8799..298d06a 100644 --- a/src/indexer/graphrag/types.rs +++ b/src/indexer/graphrag/types.rs @@ -60,6 +60,8 @@ pub enum RelationType { StrategyPattern, /// Adapter pattern (interface adaptation) AdapterPattern, + /// Document cross-reference via markdown link + References, // Low importance - Organizational relationships (weight: 0.3) /// Files in the same directory @@ -85,6 +87,7 @@ impl RelationType { | Self::ObserverPattern | Self::StrategyPattern | Self::AdapterPattern => 0.8, + Self::References => 0.6, // Low importance - organizational structure Self::SiblingModule | Self::ParentModule | Self::ChildModule => 0.3, @@ -105,6 +108,7 @@ impl RelationType { Self::ObserverPattern => "observer_pattern", Self::StrategyPattern => "strategy_pattern", Self::AdapterPattern => "adapter_pattern", + Self::References => "references", Self::SiblingModule => "sibling_module", Self::ParentModule => "parent_module", Self::ChildModule => "child_module", @@ -129,6 +133,7 @@ impl FromStr for RelationType { "observer_pattern" => Self::ObserverPattern, "strategy_pattern" => Self::StrategyPattern, "adapter_pattern" => Self::AdapterPattern, + "references" => Self::References, "sibling_module" => Self::SiblingModule, "parent_module" => Self::ParentModule, "child_module" => Self::ChildModule, @@ -264,6 +269,9 @@ mod tests { assert_eq!(RelationType::Uses.importance_weight(), 0.7); assert_eq!(RelationType::FactoryCreates.importance_weight(), 0.8); + // Document references (between structural and organizational) + assert_eq!(RelationType::References.importance_weight(), 0.6); + // Low importance relationships assert_eq!(RelationType::SiblingModule.importance_weight(), 0.3); assert_eq!(RelationType::ParentModule.importance_weight(), 0.3); @@ -290,6 +298,16 @@ mod tests { RelationType::Calls.importance_weight() > RelationType::ParentModule.importance_weight() ); + + // Verify references sit between structural imports and organizational + assert!( + RelationType::Imports.importance_weight() + > RelationType::References.importance_weight() + ); + assert!( + RelationType::References.importance_weight() + > RelationType::SiblingModule.importance_weight() + ); } #[test] @@ -315,6 +333,10 @@ mod tests { "sibling_module".parse::().unwrap(), RelationType::SiblingModule ); + assert_eq!( + "references".parse::().unwrap(), + RelationType::References + ); // Test unknown type defaults to Imports assert_eq!( @@ -343,6 +365,7 @@ mod tests { RelationType::Imports, RelationType::Calls, RelationType::Uses, + RelationType::References, RelationType::SiblingModule, ]; diff --git a/src/indexer/languages/markdown.rs b/src/indexer/languages/markdown.rs index 8a100f5..c8e3b5c 100644 --- a/src/indexer/languages/markdown.rs +++ b/src/indexer/languages/markdown.rs @@ -65,23 +65,72 @@ impl Language for Markdown { "markdown headings" } - // Markdown doesn't have traditional imports/exports - #[allow(dead_code)] - fn extract_imports_exports(&self, _node: Node, _contents: &str) -> (Vec, Vec) { - // Markdown files don't have imports or exports in the traditional sense - // Could potentially extract links to other markdown files, but that's not - // the same as code imports/exports - (Vec::new(), Vec::new()) + fn extract_imports_exports(&self, node: Node, contents: &str) -> (Vec, Vec) { + // Only extract at the root node to avoid redundant passes during + // the recursive AST walk (the JSON placeholder parser produces + // multiple nodes, each receiving the same contents string). + if node.parent().is_some() { + return (Vec::new(), Vec::new()); + } + + let mut links = Vec::new(); + // Match [text](path.md) — standard markdown links to .md files + // Skip external URLs (http:// or https://) + // Strip anchor fragments (#section) + let link_re = + regex::Regex::new(r"\[[^\]]*\]\(([^)]+\.md)(?:#[^)]*)?\)").unwrap(); + for cap in link_re.captures_iter(contents) { + if let Some(target) = cap.get(1) { + let path = target.as_str(); + if !path.starts_with("http://") && !path.starts_with("https://") { + links.push(path.to_string()); + } + } + } + + // Deduplicate — a doc may link to the same target multiple times + links.sort(); + links.dedup(); + + // Markdown "exports" are section headings + let exports = self.extract_symbols(node, contents); + (links, exports) } fn resolve_import( &self, - _import_path: &str, - _source_file: &str, - _all_files: &[String], + import_path: &str, + source_file: &str, + all_files: &[String], ) -> Option { - // Markdown doesn't have imports - None + use std::path::{Component, PathBuf}; + + let source_dir = PathBuf::from(source_file) + .parent() + .map(|p| p.to_path_buf()) + .unwrap_or_default(); + let joined = source_dir.join(import_path); + + // Normalize path components (resolve ../ and ./) + let normalized = + joined + .components() + .fold(PathBuf::new(), |mut acc, c| { + match c { + Component::ParentDir => { + acc.pop(); + } + Component::CurDir => {} + Component::Normal(os) => { + acc.push(os); + } + _ => {} + } + acc + }); + + let normalized_str = normalized.to_string_lossy().to_string(); + all_files.iter().find(|f| **f == normalized_str).cloned() } fn get_file_extensions(&self) -> Vec<&'static str> { @@ -90,3 +139,123 @@ impl Language for Markdown { } impl Markdown {} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_extract_markdown_links() { + let content = r#" +# Credit Accounts + +See [Credit Suite](../core-architecture/credit-suite.md) for details. +Also check [Pool](../core-architecture/pool.md#liquidity) and +[Adapters](./adapters.md). + +External links are ignored: [Docs](https://docs.example.com/guide.md) +Non-md links are ignored: [Image](./photo.png) +"#; + + // Test the regex logic directly (can't easily create tree-sitter Node in unit tests) + let link_re = + regex::Regex::new(r"\[[^\]]*\]\(([^)]+\.md)(?:#[^)]*)?\)").unwrap(); + let mut links = Vec::new(); + for cap in link_re.captures_iter(content) { + if let Some(target) = cap.get(1) { + let path = target.as_str(); + if !path.starts_with("http://") && !path.starts_with("https://") { + links.push(path.to_string()); + } + } + } + + assert_eq!(links.len(), 3); + assert!(links.contains(&"../core-architecture/credit-suite.md".to_string())); + assert!(links.contains(&"../core-architecture/pool.md".to_string())); + assert!(links.contains(&"./adapters.md".to_string())); + assert!(!links.iter().any(|l| l.contains("https://"))); + assert!(!links.iter().any(|l| l.contains(".png"))); + } + + #[test] + fn test_resolve_markdown_import() { + let md = Markdown; + let all_files = vec![ + "core-architecture/credit-suite.md".to_string(), + "core-architecture/pool.md".to_string(), + "introduction/adapters.md".to_string(), + ]; + + // Relative link: introduction/credit-accounts.md → ../core-architecture/credit-suite.md + let resolved = md.resolve_import( + "../core-architecture/credit-suite.md", + "introduction/credit-accounts.md", + &all_files, + ); + assert_eq!( + resolved, + Some("core-architecture/credit-suite.md".to_string()) + ); + + // Same-directory link + let resolved = md.resolve_import( + "./adapters.md", + "introduction/credit-accounts.md", + &all_files, + ); + assert_eq!(resolved, Some("introduction/adapters.md".to_string())); + + // Non-existent target + let resolved = md.resolve_import( + "../nonexistent.md", + "introduction/credit-accounts.md", + &all_files, + ); + assert_eq!(resolved, None); + } + + #[test] + fn test_resolve_with_deep_project_paths() { + let md = Markdown; + // Real-world paths from ai-assistant repo + let all_files = vec![ + "projects/gearbox/autodocs-about/docs/core-architecture/credit-suite.md".to_string(), + "projects/gearbox/autodocs-about/docs/core-architecture/pool.md".to_string(), + "projects/gearbox/autodocs-about/docs/introduction/credit-accounts.md".to_string(), + ]; + + // adapters-integrations.md links to credit-suite.md (same dir) + let resolved = md.resolve_import( + "credit-suite.md", + "projects/gearbox/autodocs-about/docs/core-architecture/adapters-integrations.md", + &all_files, + ); + assert_eq!( + resolved, + Some("projects/gearbox/autodocs-about/docs/core-architecture/credit-suite.md".to_string()), + "Same-dir link should resolve" + ); + + // adapters-integrations.md links to ../introduction/credit-accounts.md + let resolved = md.resolve_import( + "../introduction/credit-accounts.md", + "projects/gearbox/autodocs-about/docs/core-architecture/adapters-integrations.md", + &all_files, + ); + assert_eq!( + resolved, + Some("projects/gearbox/autodocs-about/docs/introduction/credit-accounts.md".to_string()), + "Parent-dir link should resolve" + ); + } + + #[test] + fn test_no_links_in_empty_doc() { + let link_re = + regex::Regex::new(r"\[[^\]]*\]\(([^)]+\.md)(?:#[^)]*)?\)").unwrap(); + let content = "# Simple heading\nNo links here."; + let links: Vec<_> = link_re.captures_iter(content).collect(); + assert!(links.is_empty()); + } +} diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 507a309..488837a 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -42,6 +42,7 @@ use crate::config::Config; use crate::mcp::logging::{log_file_processing_error, log_indexing_progress}; use crate::state; use crate::state::SharedState; +use crate::store::CodeBlock; #[cfg(test)] use crate::store::DocumentBlock; use crate::store::Store; @@ -848,6 +849,24 @@ pub async fn index_files_with_quiet( state.clone(), ) .await?; + + // Also create a synthetic CodeBlock so GraphBuilder includes + // this file in the knowledge graph. The builder reads the file + // from disk and calls Markdown::extract_imports_exports for + // actual link extraction. + if config.graphrag.enabled { + all_code_blocks.push(CodeBlock { + path: file_path.to_string(), + language: "markdown".to_string(), + content: String::new(), + symbols: vec![], + start_line: 0, + end_line: contents.lines().count(), + hash: crate::embedding::calculate_content_hash(&contents), + distance: None, + }); + } + file_processed = true; } else { // Handle code files - index as semantic code blocks only @@ -1066,6 +1085,21 @@ pub async fn index_files_with_quiet( state.clone(), ) .await?; + + // Also create a synthetic CodeBlock for GraphRAG + if config.graphrag.enabled { + all_code_blocks.push(CodeBlock { + path: file_path.to_string(), + language: "markdown".to_string(), + content: String::new(), + symbols: vec![], + start_line: 0, + end_line: contents.lines().count(), + hash: crate::embedding::calculate_content_hash(&contents), + distance: None, + }); + } + file_processed = true; } else { // Handle code files - index as semantic code blocks only @@ -1423,6 +1457,26 @@ pub async fn handle_file_change(store: &Store, file_path: &str, config: &Config) ) .await?; } + + // Update GraphRAG for this markdown file + if config.graphrag.enabled { + let md_blocks = vec![CodeBlock { + path: relative_file_path.to_string(), + language: "markdown".to_string(), + content: String::new(), + symbols: vec![], + start_line: 0, + end_line: contents.lines().count(), + hash: crate::embedding::calculate_content_hash(&contents), + distance: None, + }]; + let graph_builder = + graphrag::GraphBuilder::new_with_quiet(config.clone(), true) + .await?; + graph_builder + .process_code_blocks(&md_blocks, Some(state.clone())) + .await?; + } } else { // Handle code files let mut code_blocks_batch = Vec::new(); diff --git a/src/store/graphrag.rs b/src/store/graphrag.rs index 1d06c54..5e4ddb7 100644 --- a/src/store/graphrag.rs +++ b/src/store/graphrag.rs @@ -444,32 +444,74 @@ impl<'a> GraphRagOperations<'a> { pub async fn get_all_code_blocks_for_graphrag(&self) -> Result> { let mut all_blocks = Vec::new(); - if !self.table_ops.table_exists("code_blocks").await? { - return Ok(all_blocks); + if self.table_ops.table_exists("code_blocks").await? { + let table = self.get_table("code_blocks").await?; + + // Get all code blocks in batches to avoid memory issues + let mut results = table.query().execute().await?; + + // Process all result batches + while let Some(batch) = results.try_next().await? { + if batch.num_rows() > 0 { + // Convert batch to CodeBlocks + let converter = + crate::store::batch_converter::BatchConverter::new(self.code_vector_dim); + let mut code_blocks = converter.batch_to_code_blocks(&batch, None)?; + all_blocks.append(&mut code_blocks); + + // Log progress for large datasets + if cfg!(debug_assertions) && all_blocks.len() % 1000 == 0 { + tracing::debug!( + "Loaded {} code blocks for GraphRAG processing...", + all_blocks.len() + ); + } + } + } } - let table = self.get_table("code_blocks").await?; + // Also include markdown files from document_blocks so they get GraphRAG + // nodes and cross-reference relationships. Document blocks are stored + // separately from code blocks, so without this, markdown files would be + // invisible to GraphRAG when rebuilding from existing database. + if self.table_ops.table_exists("document_blocks").await? { + let doc_table = self.get_table("document_blocks").await?; + let mut doc_results = doc_table.query().execute().await?; - // Get all code blocks in batches to avoid memory issues - let mut results = table.query().execute().await?; + let mut seen_md_paths = std::collections::HashSet::new(); + // Collect paths already covered by code blocks + for block in &all_blocks { + seen_md_paths.insert(block.path.clone()); + } - // Process all result batches - while let Some(batch) = results.try_next().await? { - if batch.num_rows() > 0 { - // Convert batch to CodeBlocks - let converter = - crate::store::batch_converter::BatchConverter::new(self.code_vector_dim); - let mut code_blocks = converter.batch_to_code_blocks(&batch, None)?; - all_blocks.append(&mut code_blocks); - - // Log progress for large datasets - if cfg!(debug_assertions) && all_blocks.len() % 1000 == 0 { - tracing::debug!( - "Loaded {} code blocks for GraphRAG processing...", - all_blocks.len() - ); + while let Some(batch) = doc_results.try_next().await? { + if batch.num_rows() > 0 { + let converter = + crate::store::batch_converter::BatchConverter::new(self.code_vector_dim); + let doc_blocks = converter.batch_to_document_blocks(&batch, None)?; + for doc in &doc_blocks { + if doc.path.ends_with(".md") && seen_md_paths.insert(doc.path.clone()) { + // Create a synthetic CodeBlock so GraphBuilder processes + // this markdown file for nodes and cross-references + all_blocks.push(crate::store::CodeBlock { + path: doc.path.clone(), + language: "markdown".to_string(), + content: String::new(), + symbols: vec![], + start_line: 0, + end_line: doc.end_line, + hash: doc.hash.clone(), + distance: None, + }); + } + } } } + + tracing::debug!( + "Added {} markdown files from document_blocks for GraphRAG", + seen_md_paths.len().saturating_sub(all_blocks.len()) + ); } Ok(all_blocks) @@ -919,9 +961,9 @@ impl<'a> GraphRagOperations<'a> { } else if all_batches.len() == 1 { Ok(all_batches.into_iter().next().unwrap()) } else { - // For simplicity, return the first batch - // In a production system, you might want to concatenate all batches - Ok(all_batches.into_iter().next().unwrap()) + // Concatenate all batches into one + let schema = all_batches[0].schema(); + Ok(arrow::compute::concat_batches(&schema, &all_batches)?) } }