-
Notifications
You must be signed in to change notification settings - Fork 493
Integrate crawl4ai tool #697
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
19 commits
Select commit
Hold shift + click to select a range
40d642f
Craw4Ai integration WIP
rjambrecic c6bb112
Merge remote-tracking branch 'origin/main' into integrate-crawl4ai-tool
rjambrecic 565b56a
Craw4Ai integration WIP
rjambrecic d00b43d
Merge remote-tracking branch 'origin/main' into integrate-crawl4ai-tool
rjambrecic d57644a
Craw4Ai integration WIP
rjambrecic 117a49f
Craw4Ai integration WIP
rjambrecic 91cfe37
Merge remote-tracking branch 'origin/main' into integrate-crawl4ai-tool
rjambrecic a8c0fa6
Refactoring
rjambrecic 220c554
crawl4ai with LLMs WIP
rjambrecic 773d34a
Merge remote-tracking branch 'origin/main' into integrate-crawl4ai-tool
rjambrecic ec9d5f7
WIP
rjambrecic eb032cc
Refactoring
rjambrecic 5d357a2
Add crawling with extraction schema
rjambrecic 9395941
Refactoring
rjambrecic 3924eac
Refactor and add tests
rjambrecic 2cbafb0
Add initial crawl4ai notebook tutorial
rjambrecic ccd1cc5
export_module for browser-use and crawl4ai modules
rjambrecic b660a43
Fix tests
rjambrecic 9733794
Cleanup
rjambrecic File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| # Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors | ||
| # | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| from .crawl4ai import Crawl4AITool | ||
|
|
||
| __all__ = ["Crawl4AITool"] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,173 @@ | ||
| # Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors | ||
| # | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| import os | ||
| from typing import Annotated, Any, Optional, Type | ||
|
|
||
| from pydantic import BaseModel | ||
|
|
||
| from ....doc_utils import export_module | ||
| from ....import_utils import optional_import_block, require_optional_import | ||
| from ... import Tool | ||
| from ...dependency_injection import Depends, on | ||
|
|
||
| with optional_import_block(): | ||
| from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig | ||
| from crawl4ai.extraction_strategy import LLMExtractionStrategy | ||
|
|
||
| __all__ = ["Crawl4AITool"] | ||
|
|
||
|
|
||
@require_optional_import(["crawl4ai"], "crawl4ai")
@export_module("autogen.tools.experimental")
class Crawl4AITool(Tool):
    """Crawl a website and extract information using the crawl4ai library.

    The tool registers one of two callables depending on how it is built:

    * without ``llm_config`` it takes only a ``url`` and returns the crawl
      result's ``markdown``;
    * with ``llm_config`` it takes a ``url`` and an ``instruction`` and runs
      the crawl with an ``LLMExtractionStrategy``, returning the extracted
      content on success or the crawler's error message on failure.
    """

    def __init__(
        self,
        llm_config: Optional[dict[str, Any]] = None,
        extraction_model: Optional[Type[BaseModel]] = None,
        llm_strategy_kwargs: Optional[dict[str, Any]] = None,
    ) -> None:
        """Initialize the Crawl4AITool.

        Args:
            llm_config: The config dictionary for the LLM model. If None, the tool will run without LLM.
            extraction_model: The Pydantic model to use for extraction. If None, the tool will use the default schema.
            llm_strategy_kwargs: The keyword arguments to pass to the LLM extraction strategy.

        Raises:
            ValueError: If ``llm_strategy_kwargs`` is given without ``llm_config``, or if it
                contains a key that the tool sets itself (``provider``, ``api_token``,
                ``schema``, ``instruction``).
        """
        Crawl4AITool._validate_llm_strategy_kwargs(llm_strategy_kwargs, llm_config_provided=(llm_config is not None))

        async def crawl4ai_helper(  # type: ignore[no-any-unimported]
            url: str,
            browser_cfg: Optional["BrowserConfig"] = None,
            crawl_config: Optional["CrawlerRunConfig"] = None,
        ) -> Any:
            # Shared crawl routine used by both the plain and the LLM-backed entry points.
            async with AsyncWebCrawler(config=browser_cfg) as crawler:
                result = await crawler.arun(
                    url=url,
                    config=crawl_config,
                )

                # No crawl config means no extraction strategy was set up, so the
                # page markdown is the only output available.
                if crawl_config is None:
                    return result.markdown

                return result.extracted_content if result.success else result.error_message

        async def crawl4ai_without_llm(
            url: Annotated[str, "The url to crawl and extract information from."],
        ) -> Any:
            return await crawl4ai_helper(url=url)

        async def crawl4ai_with_llm(
            url: Annotated[str, "The url to crawl and extract information from."],
            instruction: Annotated[str, "The instruction to provide on how and what to extract."],
            llm_config: Annotated[dict[str, Any], Depends(on(llm_config))],
            llm_strategy_kwargs: Annotated[Optional[dict[str, Any]], Depends(on(llm_strategy_kwargs))],
            extraction_model: Annotated[Optional[Type[BaseModel]], Depends(on(extraction_model))],
        ) -> Any:
            browser_cfg = BrowserConfig(headless=True)
            crawl_config = Crawl4AITool._get_crawl_config(
                llm_config=llm_config,
                instruction=instruction,
                extraction_model=extraction_model,
                llm_strategy_kwargs=llm_strategy_kwargs,
            )

            return await crawl4ai_helper(url=url, browser_cfg=browser_cfg, crawl_config=crawl_config)

        super().__init__(
            name="crawl4ai",
            description="Crawl a website and extract information.",
            func_or_tool=crawl4ai_without_llm if llm_config is None else crawl4ai_with_llm,
        )

    @staticmethod
    def _validate_llm_strategy_kwargs(llm_strategy_kwargs: Optional[dict[str, Any]], llm_config_provided: bool) -> None:
        """Reject ``llm_strategy_kwargs`` that conflict with values the tool sets itself.

        Args:
            llm_strategy_kwargs: The user-supplied extraction strategy keyword arguments.
                ``None`` or an empty dictionary is always accepted.
            llm_config_provided: Whether an ``llm_config`` was passed to the tool.

        Raises:
            ValueError: If kwargs are supplied without an ``llm_config``, or if any reserved
                key is present. All offending keys are reported in a single message.
        """
        if not llm_strategy_kwargs:
            return

        if not llm_config_provided:
            raise ValueError("llm_strategy_kwargs can only be provided if llm_config is also provided.")

        # Reserved key -> explanation of where its value comes from instead.
        reserved_keys = (
            ("provider", "It is automatically set based on llm_config."),
            ("api_token", "It is automatically set based on llm_config."),
            ("schema", "It is automatically set based on extraction_model type."),
            ("instruction", "It is provided at the time of calling the tool."),
        )

        check_parameters_error_msg = "".join(
            f"'{key}' should not be provided in llm_strategy_kwargs. {reason}\n"
            for key, reason in reserved_keys
            if key in llm_strategy_kwargs
        )

        if check_parameters_error_msg:
            raise ValueError(check_parameters_error_msg)

    @staticmethod
    def _get_provider_and_api_key(llm_config: dict[str, Any]) -> tuple[str, str]:
        """Derive the ``provider`` string and API key from an LLM config dictionary.

        Two shapes are accepted:

        * ``{"config_list": [{"model": ..., "api_key": ..., "api_type": ...}, ...]}`` --
          the first entry is used and ``api_type`` defaults to ``"openai"``;
        * ``{"model": ...}`` -- the provider is ``"openai"`` and the key is read from the
          ``OPENAI_API_KEY`` environment variable.

        Args:
            llm_config: The LLM config dictionary.

        Returns:
            A ``(provider, api_key)`` tuple where ``provider`` is ``"<api_type>/<model>"``.
            In the model-only form ``api_key`` is whatever ``os.getenv`` returns, which is
            ``None`` when the variable is unset.

        Raises:
            ValueError: If the dictionary matches neither shape, or the first ``config_list``
                entry is missing, malformed, or lacks ``model`` / ``api_key``.
        """
        if "config_list" in llm_config:
            try:
                first_config = llm_config["config_list"][0]
                model = first_config["model"]
                api_type = first_config.get("api_type", "openai")
                api_key = first_config["api_key"]
            except (KeyError, IndexError, TypeError) as e:
                # IndexError covers an empty config_list, which is just as invalid
                # as a missing key.
                raise ValueError("llm_config must be a valid config dictionary.") from e
        elif "model" in llm_config:
            model = llm_config["model"]
            api_type = "openai"
            api_key = os.getenv("OPENAI_API_KEY")
        else:
            raise ValueError("llm_config must be a valid config dictionary.")

        provider = f"{api_type}/{model}"
        return provider, api_key  # type: ignore[return-value]

    @staticmethod
    def _get_crawl_config(  # type: ignore[no-any-unimported]
        llm_config: dict[str, Any],
        instruction: str,
        llm_strategy_kwargs: Optional[dict[str, Any]] = None,
        extraction_model: Optional[Type[BaseModel]] = None,
    ) -> "CrawlerRunConfig":
        """Build the ``CrawlerRunConfig`` used for LLM-backed extraction.

        Args:
            llm_config: The LLM config dictionary; see ``_get_provider_and_api_key``.
            instruction: The extraction instruction forwarded to the strategy.
            llm_strategy_kwargs: Extra keyword arguments for ``LLMExtractionStrategy``.
                An ``extraction_type`` entry overrides the default. The dictionary is
                not modified.
            extraction_model: Optional Pydantic model whose JSON schema is passed as the
                extraction schema.

        Returns:
            A ``CrawlerRunConfig`` with the LLM extraction strategy and ``CacheMode.BYPASS``.

        Raises:
            ValueError: If ``llm_config`` is not a valid config dictionary.
        """
        provider, api_key = Crawl4AITool._get_provider_and_api_key(llm_config)

        # Work on a copy: the same dictionary object is injected on every tool call,
        # so popping from it directly would drop a user-supplied extraction_type
        # after the first invocation.
        strategy_kwargs = dict(llm_strategy_kwargs) if llm_strategy_kwargs else {}

        schema = (
            extraction_model.model_json_schema()
            if (extraction_model and issubclass(extraction_model, BaseModel))
            else None
        )

        # Default to schema-driven extraction only when a schema is available.
        extraction_type = strategy_kwargs.pop("extraction_type", "schema" if schema else "block")

        # 1. Define the LLM extraction strategy
        llm_strategy = LLMExtractionStrategy(
            provider=provider,
            api_token=api_key,
            schema=schema,
            extraction_type=extraction_type,
            instruction=instruction,
            **strategy_kwargs,
        )

        # 2. Build the crawler config
        crawl_config = CrawlerRunConfig(extraction_strategy=llm_strategy, cache_mode=CacheMode.BYPASS)

        return crawl_config
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.