diff --git a/06-DocumentLoader/14-BiorxivLoader.ipynb b/06-DocumentLoader/14-BiorxivLoader.ipynb new file mode 100644 index 000000000..ff5c12504 --- /dev/null +++ b/06-DocumentLoader/14-BiorxivLoader.ipynb @@ -0,0 +1,388 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "635d8ebb", + "metadata": {}, + "source": [ + "# Biorxiv Loader\n", + "\n", + "- Author: [frimer](https://github.com/brian604)\n", + "- Design:\n", + "- Peer Review: \n", + "- This is a part of [LangChain Open Tutorial](https://github.com/LangChain-OpenTutorial/LangChain-OpenTutorial)\n", + "\n", + "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LangChain-OpenTutorial/LangChain-OpenTutorial/blob/main/06-DocumentLoader/14-BiorxivLoader.ipynb) [![Open in GitHub](https://img.shields.io/badge/Open%20in%20GitHub-181717?style=flat-square&logo=github&logoColor=white)](https://github.com/LangChain-OpenTutorial/LangChain-OpenTutorial/blob/main/06-DocumentLoader/14-BiorxivLoader.ipynb)\n", + "\n", + "## Overview\n", + "\n", + "This tutorial introduces two more archives of health-related and biology-related content: **medRxiv** and **bioRxiv**, both of which\n", + "are operated by the Cold Spring Harbor Laboratory.\n", + "\n", + "### Table of Contents\n", + "\n", + "- [Overview](#overview)\n", + "- [Environment Setup](#environment-setup)\n", + "- [Example Queries](#example-queries)\n", + "\n", + "### References\n", + "\n", + "- [medrxivr](https://github.com/ropensci/medrxivr)\n", + " - Access and search medRxiv and bioRxiv\n", + "- [Arxiv Langchain](https://python.langchain.com/docs/integrations/providers/arxiv/)\n", + "- [medrxiv-langchain](https://github.com/brian604/medrxiv-langchain)\n", + "----" + ] + }, + { + "cell_type": "markdown", + "id": "c6c7aba4", + "metadata": {}, + "source": [ + "## Environment Setup\n", + "\n", + "Set up the environment. 
You may refer to [Environment Setup](https://wikidocs.net/257836) for more details.\n", + "\n", + "**[Note]**\n", + "- `langchain-opentutorial` is a package that provides a set of easy-to-use environment setup, useful functions and utilities for tutorials. \n", + "- You can checkout the [`langchain-opentutorial`](https://github.com/LangChain-OpenTutorial/langchain-opentutorial-pypi) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "21943adb", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture --no-stderr\n", + "%pip install --upgrade langchain-opentutorial" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f25ec196", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "# Install required packages\n", + "from langchain_opentutorial import package\n", + "\n", + "package.install(\n", + " [\n", + " \"langsmith\",\n", + " \"langchain\",\n", + " \"langchain_core\",\n", + " \"langchain-anthropic\",\n", + " \"langchain_community\",\n", + " \"langchain_text_splitters\",\n", + " \"langchain_openai\",\n", + " ],\n", + " verbose=False,\n", + " upgrade=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7f9065ea", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Environment variables have been set successfully.\n" + ] + } + ], + "source": [ + "# Set environment variables\n", + "from langchain_opentutorial import set_env\n", + "\n", + "set_env(\n", + " {\n", + " 
\"OPENAI_API_KEY\": \"\",\n", + " \"LANGCHAIN_API_KEY\": \"\",\n", + " \"LANGCHAIN_TRACING_V2\": \"true\",\n", + " \"LANGCHAIN_ENDPOINT\": \"https://api.smith.langchain.com\",\n", + " \"LANGCHAIN_PROJECT\": \"BiorxivLoader\", # Please set it the same as title\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "690a9ae0", + "metadata": {}, + "source": [ + "You can alternatively set API keys such as `OPENAI_API_KEY` in a `.env` file and load them.\n", + "\n", + "**[Note]** This is not necessary if you've already set the required API keys in previous steps." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4f99b5b6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load API keys from .env file\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv(override=True)" + ] + }, + { + "cell_type": "markdown", + "id": "2b2fc536", + "metadata": {}, + "source": [ + "## Example Queries\n", + "\n", + "In this step, we will test out few examples to see if the biorxiv loader works as expected so it has a potential to contribute to `langchain_community`\n", + "- We will test the date range from server \"biorxiv\" for the period from 2024-01-01 to 2024-02-17" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d527750a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Testing simple keyword search...\n", + "\n", + "Searching biorxiv with URL: https://api.biorxiv.org/details/biorxiv/2000-01-01/2025-02-19/0\n", + "Total results from API: 370698\n", + "Filtered results for query 'machine learning': 0\n", + "\n", + "Testing keyword search with date range...\n", + "\n", + "Searching biorxiv with URL: https://api.biorxiv.org/details/biorxiv/2024-01-01/2024-02-17/0\n", + "Total results from API: 7235\n", + "Filtered results for 
query 'CRISPR': 2\n", + "\n", + "Searching medrxiv with URL: https://api.biorxiv.org/details/medrxiv/2024-01-01/2024-02-17/0\n", + "Total results from API: 1792\n", + "Filtered results for query 'CRISPR': 1\n", + "\n", + "Testing last 30 days search...\n", + "\n", + "Searching medrxiv with URL: https://api.biorxiv.org/details/medrxiv/2025-01-20/2025-02-19/0\n", + "Total results from API: 1306\n", + "Filtered results for query 'sequencing': 8\n", + "\n", + "Summary Statistics\n", + "-----------------\n", + "Total unique papers: 8\n", + "\n", + "Papers by Category:\n", + "genetic and genomic medicine: 4\n", + "developmental biology: 1\n", + "cell biology: 1\n", + "infectious diseases: 1\n", + "rheumatology: 1\n", + "\n", + "Papers by Server:\n", + "biorxiv: 2\n", + "medrxiv: 6\n", + "\n", + "Sample of Retrieved Papers:\n", + "\n", + "1. Establishment of CRISPR/Cas9-based knock-in in a hemimetabolous insect: targeted gene tagging in the cricket Gryllus bimaculatus\n", + "Server: biorxiv\n", + "Date: 2024-01-23\n", + "DOI: 10.1101/2021.05.10.441399\n", + "\n", + "2. When less is more - A fast TurboID KI approach for high sensitivity endogenous interactome mapping\n", + "Server: biorxiv\n", + "Date: 2024-01-11\n", + "DOI: 10.1101/2021.11.19.469212\n", + "\n", + "3. 
from collections import Counter
from datetime import datetime, timedelta
from typing import Dict, List, Optional

import requests


class SimpleBioRxivSearch:
    """Keyword search over the bioRxiv / medRxiv ``details`` API.

    The endpoint is addressed as ``{base_url}/{server}/{start}/{end}/{cursor}``
    and returns one page of records per request. Filtering by keyword is done
    client-side on the title and abstract of each returned record, so several
    pages may have to be scanned before enough matches are found.
    """

    def __init__(self, timeout: float = 30.0):
        """
        Args:
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.base_url = "https://api.biorxiv.org/details"
        self.timeout = timeout

    def search(self,
               query: str,
               server: Optional[List[str]] = None,
               start_date: Optional[str] = None,
               end_date: Optional[str] = None,
               max_results: int = 5,
               max_pages: int = 10) -> List[Dict]:
        """
        Search bioRxiv and/or medRxiv papers.

        Args:
            query: Search terms. Terms separated by ``' AND '`` must all occur
                (case-insensitively) in the title or abstract. Double quotes
                are ignored.
            server: Servers to query, e.g. ``["biorxiv", "medrxiv"]``. A single
                server name given as a string is also accepted. Defaults to
                ``["biorxiv"]``.
            start_date: First day of the range as ``YYYY-MM-DD``. Defaults to
                ``2000-01-01``.
            end_date: Last day of the range as ``YYYY-MM-DD``. Defaults to today.
            max_results: Maximum number of papers returned (per server and
                overall).
            max_pages: Maximum number of API pages scanned per server. Caps the
                number of HTTP requests for broad date ranges.

        Returns:
            A list of dicts with the keys ``title``, ``abstract``, ``doi``,
            ``category``, ``server`` and ``date``. Request failures are printed
            and the affected server is skipped; they are never raised.
        """
        if max_results <= 0:
            return []

        # Avoid a mutable default argument and guard against a bare string,
        # which would otherwise be iterated character by character.
        if server is None:
            servers = ["biorxiv"]
        elif isinstance(server, str):
            servers = [server]
        else:
            servers = list(server)

        # Default each bound independently so a half-specified range is honoured.
        start = start_date or "2000-01-01"
        end = end_date or datetime.now().strftime('%Y-%m-%d')

        # The query terms do not depend on the server, so parse them once.
        query_terms = [
            term.strip().lower()
            for term in query.replace('"', '').split(' AND ')
            if term.strip()
        ]

        results: List[Dict] = []
        for srv in servers:
            if len(results) >= max_results:
                break  # already full; further requests could not change the result
            results.extend(
                self._search_server(srv, start, end, query, query_terms,
                                    max_results, max_pages)
            )

        return results[:max_results]

    def _search_server(self,
                       srv: str,
                       start: str,
                       end: str,
                       query: str,
                       query_terms: List[str],
                       max_results: int,
                       max_pages: int) -> List[Dict]:
        """Scan up to ``max_pages`` pages of one server and return its matches."""
        matches: List[Dict] = []
        cursor = 0

        for page_number in range(max(1, max_pages)):
            url = f"{self.base_url}/{srv}/{start}/{end}/{cursor}"

            try:
                response = requests.get(url, timeout=self.timeout)
                response.raise_for_status()  # Raise an exception for bad status codes
                data = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers an invalid JSON body on older `requests` versions.
                print(f"Error accessing {srv} API: {str(e)}")
                return matches[:max_results]

            messages = data.get('messages') or [{}]
            meta = messages[0] if isinstance(messages[0], dict) else {}
            total = self._as_int(meta.get('total', 0))

            if page_number == 0:
                # Print API response for debugging
                print(f"\nSearching {srv} with URL: {url}")
                print(f"Total results from API: {total}")

            page = data.get('collection') or []
            if not page:
                break

            matches.extend(
                self._to_record(paper, srv)
                for paper in page
                if self._matches(paper, query_terms)
            )

            # Advance by what was actually received rather than an assumed page size.
            cursor += len(page)
            # An unknown total (reported as 0) stops after the first page.
            if len(matches) >= max_results or cursor >= total:
                break

        print(f"Filtered results for query '{query}': {len(matches)}")
        return matches[:max_results]

    @staticmethod
    def _matches(paper: Dict, query_terms: List[str]) -> bool:
        """Return True if every query term occurs in the paper's title or abstract."""
        # `or ''` tolerates records whose title/abstract is present but null.
        text_to_search = ((paper.get('title') or '') + ' ' + (paper.get('abstract') or '')).lower()
        return all(term in text_to_search for term in query_terms)

    @staticmethod
    def _to_record(paper: Dict, srv: str) -> Dict:
        """Reduce a raw API record to the fields exposed by ``search``."""
        return {
            'title': paper.get('title', ''),
            'abstract': paper.get('abstract', ''),
            'doi': paper.get('doi', ''),
            'category': paper.get('category', 'Unknown'),
            'server': srv,
            'date': paper.get('date', ''),
        }

    @staticmethod
    def _as_int(value) -> int:
        """Convert an API counter (int or numeric string) to int; 0 if not numeric."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0


def dedupe_by_doi(docs: List[Dict]) -> List[Dict]:
    """Return ``docs`` without repeated DOIs, keeping the first occurrence.

    Records without a DOI cannot be compared and are always kept.
    """
    seen = set()
    unique = []
    for doc in docs:
        doi = doc.get('doi')
        if doi and doi in seen:
            continue
        if doi:
            seen.add(doi)
        unique.append(doc)
    return unique


# Usage example:
searcher = SimpleBioRxivSearch()

# 1. Simple keyword search
print("\nTesting simple keyword search...")
docs1 = searcher.search(
    query="machine learning",  # Simplified query
    server=["biorxiv"],
    max_results=5
)

# 2. Keyword search with date range
print("\nTesting keyword search with date range...")
docs2 = searcher.search(
    query="CRISPR",  # Simplified query
    server=["biorxiv", "medrxiv"],
    start_date="2024-01-01",
    end_date="2024-02-17",
    max_results=5
)

# 3. Last 30 days search
print("\nTesting last 30 days search...")
end_date = datetime.now()
start_date = end_date - timedelta(days=30)
docs3 = searcher.search(
    query="sequencing",  # Simplified query
    server=["medrxiv"],
    start_date=start_date.strftime("%Y-%m-%d"),
    end_date=end_date.strftime("%Y-%m-%d"),
    max_results=5
)

# Print summary statistics
# The same paper can be returned by more than one search, so de-duplicate
# before reporting "unique" counts.
all_docs = dedupe_by_doi(docs1 + docs2 + docs3)
categories = Counter(doc['category'] for doc in all_docs)

# Pre-seed the known servers so both are listed even when a count is zero.
servers = {"biorxiv": 0, "medrxiv": 0}
for doc in all_docs:
    servers[doc['server']] = servers.get(doc['server'], 0) + 1

print("\nSummary Statistics")
print("-----------------")
print(f"Total unique papers: {len(all_docs)}")
print("\nPapers by Category:")
for cat, count in categories.most_common():
    print(f"{cat}: {count}")

print("\nPapers by Server:")
for server_name, count in servers.items():
    print(f"{server_name}: {count}")

# Print sample of results
print("\nSample of Retrieved Papers:")
for i, doc in enumerate(all_docs[:3], 1):
    print(f"\n{i}. {doc['title']}")
    print(f"Server: {doc['server']}")
    print(f"Date: {doc['date']}")
    print(f"DOI: {doc['doi']}")