From 6806109fd444ed8650a9c4c7efe838a536fac6db Mon Sep 17 00:00:00 2001 From: Win Cheng Date: Mon, 14 Jul 2025 10:40:55 -0700 Subject: [PATCH 1/3] removed proxy from response --- jigsawstack/web.py | 21 ++- tests/test_scrape.py | 359 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 377 insertions(+), 3 deletions(-) create mode 100644 tests/test_scrape.py diff --git a/jigsawstack/web.py b/jigsawstack/web.py index 82aa2b8..337fa2a 100644 --- a/jigsawstack/web.py +++ b/jigsawstack/web.py @@ -93,12 +93,18 @@ class WaitFor(TypedDict): value: Union[str, int] -class AdvanceConfig(TypedDict): +class AdvanceConfigRequest(TypedDict): console: bool network: bool cookies: bool +class AdvanceConfigResponse(TypedDict): + console: list + network: list + cookies: list + + class BYOProxyAuth(TypedDict): username: str password: str @@ -117,7 +123,7 @@ class BaseAIScrapeParams(TypedDict): reject_request_pattern: NotRequired[List[str]] goto_options: NotRequired[GotoOptions] wait_for: NotRequired[WaitFor] - advance_config: NotRequired[AdvanceConfig] + advance_config: NotRequired[AdvanceConfigRequest] size_preset: NotRequired[str] is_mobile: NotRequired[bool] scale: NotRequired[int] @@ -164,13 +170,22 @@ class Link(TypedDict): type: Literal["a", "img"] +class Meta(TypedDict): + title: Optional[str] + description: Optional[str] + keywords: Optional[str] + og_image: Optional[str] + + class AIScrapeResponse(TypedDict): success: bool data: List[DataItem] page_position: int page_position_length: int - context: Dict[str, List[str]] + advance_config: Optional[AdvanceConfigResponse] + context: Any selectors: Dict[str, List[str]] + meta: Optional[Meta] link: List[Link] diff --git a/tests/test_scrape.py b/tests/test_scrape.py new file mode 100644 index 0000000..7a8a9fe --- /dev/null +++ b/tests/test_scrape.py @@ -0,0 +1,359 @@ +from unittest.mock import MagicMock +import unittest +from jigsawstack.exceptions import JigsawStackError +from jigsawstack import JigsawStack, AsyncJigsawStack +import pytest +import asyncio +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +# Synchronous AI Scrape Tests +def test_ai_scrape_with_selectors(): + """Test AI scrape with CSS selectors""" + client = JigsawStack() + try: + result = client.web.ai_scrape( + { + "url": "https://news.ycombinator.com/news", + "selectors": [".titles", ".points"], + } + ) + assert result["success"] == True + assert "data" in result + logger.info("AI scrape with selectors test passed") + except JigsawStackError as e: + pytest.fail(f"Unexpected JigsawStackError: {e}") + + +def test_ai_scrape_with_element_prompts(): + """Test AI scrape with element prompts""" + client = JigsawStack() + try: + result = client.web.ai_scrape( + { + "url": "https://news.ycombinator.com/news", + "element_prompts": ["titles", "points"], + } + ) + assert result["success"] == True + assert "data" in result + logger.info("AI scrape with element prompts test passed") + except JigsawStackError as e: + pytest.fail(f"Unexpected JigsawStackError: {e}") + + +def test_ai_scrape_with_selectors_and_prompts(): + """Test AI scrape with both selectors and element prompts""" + client = JigsawStack() + try: + result = client.web.ai_scrape( + { + "url": "https://news.ycombinator.com/news", + "selectors": [".titles", ".points"], + "element_prompts": ["titles", "points"], + } + ) + assert result["success"] == True + assert "data" in result + logger.info("AI scrape with selectors and prompts test passed") + except JigsawStackError as e: + pytest.fail(f"Unexpected JigsawStackError: {e}") + + +def test_ai_scrape_with_advanced_config(): + """Test AI scrape with advanced configuration options""" + client = JigsawStack() + try: + result = client.web.ai_scrape( + { + "url": "https://news.ycombinator.com/news", + "selectors": [".titles", ".points"], + "root_element_selector": "main", + "page_position": 0, + "http_headers": {"User-Agent": "JigsawStack-Test/1.0"}, + "goto_options": {"timeout": 30000, "wait_until": "domcontentloaded"}, + "wait_for": {"mode": "selector", "value": ".content"}, + "advance_config": {"console": True, "network": False, "cookies": True}, + "is_mobile": False, + "scale": 1, + "width": 1920, + "height": 1080, + } + ) + assert result["success"] == True + assert "data" in result + logger.info("AI scrape with advanced config test passed") + except JigsawStackError as e: + pytest.fail(f"Unexpected JigsawStackError: {e}") + + +def test_ai_scrape_with_cookies(): + """Test AI scrape with custom cookies""" + client = JigsawStack() + try: + result = client.web.ai_scrape( + { + "url": "https://example.com", + "selectors": [".user-content"], + "cookies": [ + { + "name": "session_id", + "value": "abc123", + "domain": "example.com", + "path": "/", + "secure": True, + "httpOnly": True, + "sameSite": "Strict", + } + ], + } + ) + assert result["success"] == True + assert "data" in result + logger.info("AI scrape with cookies test passed") + except JigsawStackError as e: + pytest.fail(f"Unexpected JigsawStackError: {e}") + + +def test_ai_scrape_with_proxy(): + """Test AI scrape with BYO proxy configuration""" + client = JigsawStack() + try: + result = client.web.ai_scrape( + { + "url": "https://example.com", + "element_prompts": ["Extract main content"], + "force_rotate_proxy": True, + "byo_proxy": { + "server": "proxy.example.com:8080", + "auth": {"username": "proxy_user", "password": "proxy_pass"}, + }, + } + ) + assert result["success"] == True + assert "data" in result + logger.info("AI scrape with proxy test passed") + except JigsawStackError as e: + pytest.fail(f"Unexpected JigsawStackError: {e}") + + +def test_ai_scrape_mobile_preset(): + """Test AI scrape with mobile size preset""" + client = JigsawStack() + try: + result = client.web.ai_scrape( + { + "url": "https://example.com", + "selectors": [".mobile-content"], + "size_preset": "mobile", + "is_mobile": True, + } + ) + assert result["success"] == True + assert "data" in result + logger.info("AI scrape mobile preset test passed") + except JigsawStackError as e: + pytest.fail(f"Unexpected JigsawStackError: {e}") + + +def test_ai_scrape_with_request_filtering(): + """Test AI scrape with request pattern rejection""" + client = JigsawStack() + try: + result = client.web.ai_scrape( + { + "url": "https://example.com", + "element_prompts": ["Get main article text"], + "reject_request_pattern": [ + ".*\\.js$", + ".*\\.css$", + ".*analytics.*", + ".*ads.*", + ], + } + ) + assert result["success"] == True + assert "data" in result + logger.info("AI scrape with request filtering test passed") + except JigsawStackError as e: + pytest.fail(f"Unexpected JigsawStackError: {e}") + + +# Asynchronous AI Scrape Tests +def test_async_ai_scrape_with_selectors(): + """Test async AI scrape with CSS selectors""" + + async def _test(): + client = AsyncJigsawStack() + try: + result = await client.web.ai_scrape( + { + "url": "https://example.com", + "selectors": [".title", ".price", ".description"], + } + ) + assert result["success"] == True + assert "data" in result + logger.info("Async AI scrape with selectors test passed") + except JigsawStackError as e: + pytest.fail(f"Unexpected JigsawStackError: {e}") + + asyncio.run(_test()) + + +def test_async_ai_scrape_with_element_prompts(): + """Test async AI scrape with element prompts""" + + async def _test(): + client = AsyncJigsawStack() + try: + result = await client.web.ai_scrape( + { + "url": "https://example.com", + "element_prompts": [ + "Find the product title", + "Extract the price", + "Get the product description", + ], + } + ) + assert result["success"] == True + assert "data" in result + logger.info("Async AI scrape with element prompts test passed") + except JigsawStackError as e: + pytest.fail(f"Unexpected JigsawStackError: {e}") + + asyncio.run(_test()) + + +def test_async_ai_scrape_with_timeout_config(): + """Test async AI scrape with timeout and wait configurations""" + + async def _test(): + client = AsyncJigsawStack() + try: + result = await client.web.ai_scrape( + { + "url": "https://example.com", + "selectors": [".dynamic-content"], + "goto_options": {"timeout": 60000, "wait_until": "networkidle2"}, + "wait_for": {"mode": "timeout", "value": 3000}, + } + ) + assert result["success"] == True + assert "data" in result + logger.info("Async AI scrape with timeout config test passed") + except JigsawStackError as e: + pytest.fail(f"Unexpected JigsawStackError: {e}") + + asyncio.run(_test()) + + +def test_async_ai_scrape_comprehensive(): + """Test async AI scrape with comprehensive configuration""" + + async def _test(): + client = AsyncJigsawStack() + try: + result = await client.web.ai_scrape( + { + "url": "https://example.com", + "selectors": [".product-title", ".price", ".availability"], + "element_prompts": [ + "Extract product information", + "Get pricing details", + ], + "root_element_selector": ".product-container", + "page_position": 1, + "http_headers": { + "User-Agent": "JigsawStack-AsyncTest/1.0", + "Accept-Language": "en-US,en;q=0.9", + }, + "goto_options": {"timeout": 45000, "wait_until": "load"}, + "wait_for": {"mode": "selector", "value": ".product-container"}, + "advance_config": { + "console": False, + "network": True, + "cookies": True, + }, + "size_preset": "desktop", + "is_mobile": False, + "scale": 1, + "width": 1366, + "height": 768, + "force_rotate_proxy": False, + "reject_request_pattern": [".*\\.gif$", ".*tracking.*"], + } + ) + assert result["success"] == True + assert "data" in result + assert "meta" in result + assert "link" in result + assert "selectors" in result + logger.info("Async comprehensive AI scrape test passed") + except JigsawStackError as e: + pytest.fail(f"Unexpected JigsawStackError: {e}") + + asyncio.run(_test()) + + +# Error Handling Tests +def test_ai_scrape_missing_required_params(): + """Test AI scrape error handling for missing required parameters""" + client = JigsawStack() + try: + # Missing both selectors and element_prompts + result = client.web.ai_scrape({"url": "https://example.com"}) + # Should still work as the API might have defaults + logger.info("AI scrape with minimal params completed") + except JigsawStackError as e: + # Expected error for insufficient parameters + logger.info(f"Expected error for missing params: {e}") + + +def test_async_ai_scrape_invalid_url(): + """Test async AI scrape error handling for invalid URL""" + + async def _test(): + client = AsyncJigsawStack() + try: + result = await client.web.ai_scrape( + {"url": "invalid-url", "selectors": [".content"]} + ) + # Might succeed depending on API behavior + logger.info("AI scrape with invalid URL completed") + except JigsawStackError as e: + # Expected error for invalid URL + logger.info(f"Expected error for invalid URL: {e}") + + asyncio.run(_test()) + + +# Response Structure Validation Tests +def test_ai_scrape_response_structure(): + """Test that AI scrape response has expected structure""" + client = JigsawStack() + try: + result = client.web.ai_scrape( + {"url": "https://example.com", "selectors": [".title"]} + ) + + # Validate response structure + assert "success" in result + assert "data" in result + assert isinstance(result["data"], list) + + # Check for optional fields + if "meta" in result: + assert isinstance(result["meta"], (dict, type(None))) + if "link" in result: + assert isinstance(result["link"], list) + if "selectors" in result: + assert isinstance(result["selectors"], dict) + + logger.info("AI scrape response structure validation passed") + except JigsawStackError as e: + pytest.fail(f"Unexpected JigsawStackError: {e}") From 46107ff041c81a66ab39d02f806c48054b3a4e73 Mon Sep 17 00:00:00 2001 From: Khurdhula-Harshavardhan Date: Mon, 14 Jul 2025 11:17:02 -0700 Subject: [PATCH 2/3] fix: del test file. --- tests/test_scrape.py | 359 ------------------------------------------- 1 file changed, 359 deletions(-) delete mode 100644 tests/test_scrape.py diff --git a/tests/test_scrape.py b/tests/test_scrape.py deleted file mode 100644 index 7a8a9fe..0000000 --- a/tests/test_scrape.py +++ /dev/null @@ -1,359 +0,0 @@ -from unittest.mock import MagicMock -import unittest -from jigsawstack.exceptions import JigsawStackError -from jigsawstack import JigsawStack, AsyncJigsawStack -import pytest -import asyncio -import logging - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -# Synchronous AI Scrape Tests -def test_ai_scrape_with_selectors(): - """Test AI scrape with CSS selectors""" - client = JigsawStack() - try: - result = client.web.ai_scrape( - { - "url": "https://news.ycombinator.com/news", - "selectors": [".titles", ".points"], - } - ) - assert result["success"] == True - assert "data" in result - logger.info("AI scrape with selectors test passed") - except JigsawStackError as e: - pytest.fail(f"Unexpected JigsawStackError: {e}") - - -def test_ai_scrape_with_element_prompts(): - """Test AI scrape with element prompts""" - client = JigsawStack() - try: - result = client.web.ai_scrape( - { - "url": "https://news.ycombinator.com/news", - "element_prompts": ["titles", "points"], - } - ) - assert result["success"] == True - assert "data" in result - logger.info("AI scrape with element prompts test passed") - except JigsawStackError as e: - pytest.fail(f"Unexpected JigsawStackError: {e}") - - -def test_ai_scrape_with_selectors_and_prompts(): - """Test AI scrape with both selectors and element prompts""" - client = JigsawStack() - try: - result = client.web.ai_scrape( - { - "url": "https://news.ycombinator.com/news", - "selectors": [".titles", ".points"], - "element_prompts": ["titles", "points"], - } - ) - assert result["success"] == True - assert "data" in result - logger.info("AI scrape with selectors and prompts test passed") - except JigsawStackError as e: - pytest.fail(f"Unexpected JigsawStackError: {e}") - - -def test_ai_scrape_with_advanced_config(): - """Test AI scrape with advanced configuration options""" - client = JigsawStack() - try: - result = client.web.ai_scrape( - { - "url": "https://news.ycombinator.com/news", - "selectors": [".titles", ".points"], - "root_element_selector": "main", - "page_position": 0, - "http_headers": {"User-Agent": "JigsawStack-Test/1.0"}, - "goto_options": {"timeout": 30000, "wait_until": "domcontentloaded"}, - "wait_for": {"mode": "selector", "value": ".content"}, - "advance_config": {"console": True, "network": False, "cookies": True}, - "is_mobile": False, - "scale": 1, - "width": 1920, - "height": 1080, - } - ) - assert result["success"] == True - assert "data" in result - logger.info("AI scrape with advanced config test passed") - except JigsawStackError as e: - pytest.fail(f"Unexpected JigsawStackError: {e}") - - -def test_ai_scrape_with_cookies(): - """Test AI scrape with custom cookies""" - client = JigsawStack() - try: - result = client.web.ai_scrape( - { - "url": "https://example.com", - "selectors": [".user-content"], - "cookies": [ - { - "name": "session_id", - "value": "abc123", - "domain": "example.com", - "path": "/", - "secure": True, - "httpOnly": True, - "sameSite": "Strict", - } - ], - } - ) - assert result["success"] == True - assert "data" in result - logger.info("AI scrape with cookies test passed") - except JigsawStackError as e: - pytest.fail(f"Unexpected JigsawStackError: {e}") - - -def test_ai_scrape_with_proxy(): - """Test AI scrape with BYO proxy configuration""" - client = JigsawStack() - try: - result = client.web.ai_scrape( - { - "url": "https://example.com", - "element_prompts": ["Extract main content"], - "force_rotate_proxy": True, - "byo_proxy": { - "server": "proxy.example.com:8080", - "auth": {"username": "proxy_user", "password": "proxy_pass"}, - }, - } - ) - assert result["success"] == True - assert "data" in result - logger.info("AI scrape with proxy test passed") - except JigsawStackError as e: - pytest.fail(f"Unexpected JigsawStackError: {e}") - - -def test_ai_scrape_mobile_preset(): - """Test AI scrape with mobile size preset""" - client = JigsawStack() - try: - result = client.web.ai_scrape( - { - "url": "https://example.com", - "selectors": [".mobile-content"], - "size_preset": "mobile", - "is_mobile": True, - } - ) - assert result["success"] == True - assert "data" in result - logger.info("AI scrape mobile preset test passed") - except JigsawStackError as e: - pytest.fail(f"Unexpected JigsawStackError: {e}") - - -def test_ai_scrape_with_request_filtering(): - """Test AI scrape with request pattern rejection""" - client = JigsawStack() - try: - result = client.web.ai_scrape( - { - "url": "https://example.com", - "element_prompts": ["Get main article text"], - "reject_request_pattern": [ - ".*\\.js$", - ".*\\.css$", - ".*analytics.*", - ".*ads.*", - ], - } - ) - assert result["success"] == True - assert "data" in result - logger.info("AI scrape with request filtering test passed") - except JigsawStackError as e: - pytest.fail(f"Unexpected JigsawStackError: {e}") - - -# Asynchronous AI Scrape Tests -def test_async_ai_scrape_with_selectors(): - """Test async AI scrape with CSS selectors""" - - async def _test(): - client = AsyncJigsawStack() - try: - result = await client.web.ai_scrape( - { - "url": "https://example.com", - "selectors": [".title", ".price", ".description"], - } - ) - assert result["success"] == True - assert "data" in result - logger.info("Async AI scrape with selectors test passed") - except JigsawStackError as e: - pytest.fail(f"Unexpected JigsawStackError: {e}") - - asyncio.run(_test()) - - -def test_async_ai_scrape_with_element_prompts(): - """Test async AI scrape with element prompts""" - - async def _test(): - client = AsyncJigsawStack() - try: - result = await client.web.ai_scrape( - { - "url": "https://example.com", - "element_prompts": [ - "Find the product title", - "Extract the price", - "Get the product description", - ], - } - ) - assert result["success"] == True - assert "data" in result - logger.info("Async AI scrape with element prompts test passed") - except JigsawStackError as e: - pytest.fail(f"Unexpected JigsawStackError: {e}") - - asyncio.run(_test()) - - -def test_async_ai_scrape_with_timeout_config(): - """Test async AI scrape with timeout and wait configurations""" - - async def _test(): - client = AsyncJigsawStack() - try: - result = await client.web.ai_scrape( - { - "url": "https://example.com", - "selectors": [".dynamic-content"], - "goto_options": {"timeout": 60000, "wait_until": "networkidle2"}, - "wait_for": {"mode": "timeout", "value": 3000}, - } - ) - assert result["success"] == True - assert "data" in result - logger.info("Async AI scrape with timeout config test passed") - except JigsawStackError as e: - pytest.fail(f"Unexpected JigsawStackError: {e}") - - asyncio.run(_test()) - - -def test_async_ai_scrape_comprehensive(): - """Test async AI scrape with comprehensive configuration""" - - async def _test(): - client = AsyncJigsawStack() - try: - result = await client.web.ai_scrape( - { - "url": "https://example.com", - "selectors": [".product-title", ".price", ".availability"], - "element_prompts": [ - "Extract product information", - "Get pricing details", - ], - "root_element_selector": ".product-container", - "page_position": 1, - "http_headers": { - "User-Agent": "JigsawStack-AsyncTest/1.0", - "Accept-Language": "en-US,en;q=0.9", - }, - "goto_options": {"timeout": 45000, "wait_until": "load"}, - "wait_for": {"mode": "selector", "value": ".product-container"}, - "advance_config": { - "console": False, - "network": True, - "cookies": True, - }, - "size_preset": "desktop", - "is_mobile": False, - "scale": 1, - "width": 1366, - "height": 768, - "force_rotate_proxy": False, - "reject_request_pattern": [".*\\.gif$", ".*tracking.*"], - } - ) - assert result["success"] == True - assert "data" in result - assert "meta" in result - assert "link" in result - assert "selectors" in result - logger.info("Async comprehensive AI scrape test passed") - except JigsawStackError as e: - pytest.fail(f"Unexpected JigsawStackError: {e}") - - asyncio.run(_test()) - - -# Error Handling Tests -def test_ai_scrape_missing_required_params(): - """Test AI scrape error handling for missing required parameters""" - client = JigsawStack() - try: - # Missing both selectors and element_prompts - result = client.web.ai_scrape({"url": "https://example.com"}) - # Should still work as the API might have defaults - logger.info("AI scrape with minimal params completed") - except JigsawStackError as e: - # Expected error for insufficient parameters - logger.info(f"Expected error for missing params: {e}") - - -def test_async_ai_scrape_invalid_url(): - """Test async AI scrape error handling for invalid URL""" - - async def _test(): - client = AsyncJigsawStack() - try: - result = await client.web.ai_scrape( - {"url": "invalid-url", "selectors": [".content"]} - ) - # Might succeed depending on API behavior - logger.info("AI scrape with invalid URL completed") - except JigsawStackError as e: - # Expected error for invalid URL - logger.info(f"Expected error for invalid URL: {e}") - - asyncio.run(_test()) - - -# Response Structure Validation Tests -def test_ai_scrape_response_structure(): - """Test that AI scrape response has expected structure""" - client = JigsawStack() - try: - result = client.web.ai_scrape( - {"url": "https://example.com", "selectors": [".title"]} - ) - - # Validate response structure - assert "success" in result - assert "data" in result - assert isinstance(result["data"], list) - - # Check for optional fields - if "meta" in result: - assert isinstance(result["meta"], (dict, type(None))) - if "link" in result: - assert isinstance(result["link"], list) - if "selectors" in result: - assert isinstance(result["selectors"], dict) - - logger.info("AI scrape response structure validation passed") - except JigsawStackError as e: - pytest.fail(f"Unexpected JigsawStackError: {e}") From b049fda5f64d15ccb12027afd1e4404453cd8062 Mon Sep 17 00:00:00 2001 From: Win Cheng Date: Mon, 14 Jul 2025 11:22:21 -0700 Subject: [PATCH 3/3] update vers --- jigsawstack/version.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jigsawstack/version.py b/jigsawstack/version.py index 61c51fd..e693204 100644 --- a/jigsawstack/version.py +++ b/jigsawstack/version.py @@ -1,4 +1,4 @@ -__version__ = "0.2.8" +__version__ = "0.2.9" def get_version() -> str: diff --git a/setup.py b/setup.py index eeb244d..a3c40b1 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name="jigsawstack", - version="0.2.8", + version="0.2.9", description="JigsawStack - The AI SDK for Python", long_description=open("README.md", encoding="utf8").read(), long_description_content_type="text/markdown",