-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgetNewIOCs.py
More file actions
280 lines (225 loc) · 11.1 KB
/
getNewIOCs.py
File metadata and controls
280 lines (225 loc) · 11.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
"""
Online information downloading
This module handles the downloading, processing, and management of IOC events from the CIRCL OSInt feed.
It provides functions to download individual events, perform concurrent downloads of multiple events, and
compare UUIDs between the CIRCL feed and a local SQLite database. The module also includes functions to
retrieve UUID lists from both the local database and the CIRCL manifest. Downloaded events are stored as
JSON files in a specified directory, and the module integrates with the IOC_DATABASE class for database operations.
Functions:
download_event(uuid, events_dir, session)
Downloads a single event based on its UUID and saves it as a JSON file.
downloadEvents(uuid_list, events_dir, max_workers=10)
Downloads multiple events concurrently using a thread pool.
findUniqueUUIDs(db_list, circl_list)
Returns the list of UUIDs present in the CIRCL feed but not in the local database.
getDB_UUIDs(file)
Retrieves the list of event UUIDs from the local SQLite database.
getCIRCL_UUIDs()
Downloads and returns the list of event UUIDs from the CIRCL manifest.
"""
import requests
import json
import os
from python_types.db import IOC_DATABASE
from concurrent.futures import ThreadPoolExecutor, as_completed
def download_event(uuid, events_dir, session):
    """
    Download a single event JSON file from the CIRCL OSInt feed.

    The event is fetched from the CIRCL OSInt feed URL and stored as
    ``<uuid>.json`` inside ``events_dir``. If that file already exists the
    download is skipped. The payload is first written to a temporary
    ``<uuid>.json.part`` file and only renamed to its final name once it has
    been written completely, so an interrupted write never leaves a truncated
    ``.json`` file that later runs would skip as "already exists".

    This function never raises for network, decoding or filesystem errors;
    they are all reported through the returned message so that callers
    running it in a thread pool can rely on the message prefix.

    Args:
        uuid (str): The unique identifier of the event.
        events_dir (str): The directory where the event JSON file will be saved.
        session (requests.Session): An active requests Session object for making HTTP requests.

    Returns:
        str: A message starting with "Downloaded", "Skipped" or "Failed"
        describing what happened for this event.
    """
    # TODO: see downloadEvents
    filename = f'{uuid}.json'
    file_path = os.path.join(events_dir, filename)

    # Nothing to do when the event has already been stored.
    if os.path.exists(file_path):
        return f"Skipped (already exists): {filename}"

    url = f'https://www.circl.lu/doc/misp/feed-osint/{filename}'
    tmp_path = f'{file_path}.part'
    try:
        with session.get(url, timeout=10) as response:
            response.raise_for_status()
            event_data = response.json()
        with open(tmp_path, 'w') as f:
            json.dump(event_data, f)
        # Atomic rename: the final name only ever points at a complete file.
        os.replace(tmp_path, file_path)
    except (OSError, ValueError) as e:
        # requests.RequestException derives from OSError, so this covers HTTP
        # and connection errors as well as filesystem errors; ValueError
        # covers a response body that is not valid JSON.
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # Best-effort cleanup: the partial file may not exist.
        return f"Failed to download {filename}: {e}"
    return f"Downloaded: {filename}"
def _download_batch(uuid_list, events_dir, max_workers):
    """Download every UUID of uuid_list in a thread pool and return the UUIDs that failed."""
    failed = []
    with requests.Session() as session:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit the download tasks, remembering which UUID each future belongs to.
            futures = {
                executor.submit(download_event, uuid, events_dir, session): uuid
                for uuid in uuid_list
            }
            for future in as_completed(futures):
                uuid = futures[future]
                try:
                    result = future.result()
                except Exception as e:
                    # One crashing worker must not abort the whole batch.
                    print(f"[!] Error : getNewIOCs -> unexpected error while downloading {uuid} : {e}")
                    failed.append(uuid)
                    continue
                if result.startswith("Failed"):
                    failed.append(uuid)
    return failed


def downloadEvents(uuid_list, events_dir, max_workers=10):
    """
    Download multiple event JSON files concurrently.

    The events directory is created if needed. All UUIDs are first downloaded
    with up to ``max_workers`` threads; the UUIDs whose download failed are
    then retried once with a single worker. UUIDs that still fail after the
    retry are printed; otherwise a completion message is printed.

    Args:
        uuid_list (list): A list of event UUIDs (as strings) to download.
        events_dir (str): The directory where event JSON files will be stored.
        max_workers (int, optional): The maximum number of concurrent worker threads. Defaults to 10.

    Returns:
        None
    """
    # TODO: rewrite the whole function using async.
    # Make sure the target directory exists.
    os.makedirs(events_dir, exist_ok=True)

    # Nothing to do for an empty UUID list.
    if not uuid_list:
        print("UUID list is empty, nothing to download.")
        return

    # Drop duplicates (keeping order) so two threads never write the same file.
    pending = list(dict.fromkeys(uuid_list))

    failed = _download_batch(pending, events_dir, max_workers)
    if failed:
        # Second attempt, sequentially, for the files that were not downloaded.
        failed = _download_batch(failed, events_dir, 1)

    if failed:
        print("[!] Error : getNewIOCs -> can't download this files : ",failed)
    else:
        print("Download completed.")
def findUniqueUUIDs(db_list, circl_list):
    """
    Return the UUIDs that appear in the CIRCL feed but not in the local database.

    Both inputs are converted to sets, so duplicates are collapsed and the
    order of the returned list is not defined.

    Args:
        db_list (list): Event UUIDs already stored in the local database.
        circl_list (list): Event UUIDs advertised by the CIRCL feed.

    Returns:
        list: The UUIDs present in circl_list and absent from db_list.
    """
    known_uuids = set(db_list)
    feed_uuids = set(circl_list)

    # Keep only what the feed has and the database lacks.
    missing_uuids = feed_uuids - known_uuids
    missing_count = len(missing_uuids)

    print(f" [+] Info -> findUniqueUUIDs : Unique UUIDs in CIRCL (not in DB): {missing_count}")
    return list(missing_uuids)
def getDB_UUIDs(file):
    """
    Read all event UUIDs stored in the local database.

    The database is opened through the IOC_DATABASE class (in silent mode),
    the ``event_uuid`` column of the ``Event`` table is queried, and the
    values are returned as a flat list. A ValueError raised while doing so
    is reported on stdout and results in an empty list.

    Args:
        file (str): The file path to the SQLite database.

    Returns:
        list: The event UUIDs found in the database, or an empty list if a
        ValueError occurred.
    """
    try:
        # silent=True: open the database without printing information.
        database = IOC_DATABASE(file, silent=True)
        database.cursor.execute("SELECT event_uuid FROM Event")
        rows = database.cursor.fetchall()
        # Each row holds a single column; keep only the UUID value.
        uuids = [row[0] for row in rows]
        print(f" [+] Info -> getDB_UUIDs : {len(uuids)} UUIDs found in the database.")
        return uuids
    except ValueError as err:
        print(f" [!] Warning -> getDB_UUIDs : Error while getting list of UUIDs: {err}")
        return []
def getCIRCL_UUIDs():
    """
    Retrieve event UUIDs from the CIRCL manifest.

    Downloads manifest.json from the CIRCL OSInt feed and returns its
    top-level keys, which are the event UUIDs. Every failure mode (timeout,
    request error, non-200 status, body that is not valid JSON, JSON that is
    not an object) is reported on stdout and yields an empty list, so this
    function does not raise for a bad download.

    Returns:
        list: A list of event UUIDs from the CIRCL feed. Returns an empty list if the download fails.
    """
    # TODO: make an async function to speed up the execution.
    manifest_url = 'https://www.circl.lu/doc/misp/feed-osint/manifest.json'
    print(" [+] Info -> getCIRCL_UUIDs : Downloading manifest.json")

    try:
        manifest_response = requests.get(manifest_url, timeout=10)  # 10 seconds timeout
    except requests.exceptions.Timeout:
        print(" [!] Error -> getCIRCL_UUIDs : The request to the manifest URL timed out.")
        return []
    except requests.exceptions.RequestException as e:
        print(f" [!] Error -> getCIRCL_UUIDs : There was an issue with the request to the manifest URL: {e}")
        return []

    print(f" [+] Info -> getCIRCL_UUIDs : HTTP status code: {manifest_response.status_code}")
    if manifest_response.status_code != 200:
        print(f" [!] Warning -> getCIRCL_UUIDs : Received status code {manifest_response.status_code}")
        return []

    try:
        manifest = manifest_response.json()
    except ValueError as e:
        # Depending on the requests version, a decode error is either a plain
        # ValueError or a RequestException that also derives from ValueError.
        print(f" [!] Error -> getCIRCL_UUIDs : The manifest is not valid JSON: {e}")
        return []

    # The UUIDs are the keys of the manifest, so anything but an object is unusable.
    if not isinstance(manifest, dict):
        print(" [!] Error -> getCIRCL_UUIDs : The manifest is not a JSON object.")
        return []

    uuids = list(manifest)
    print(f" [+] Info -> getCIRCL_UUIDs : {len(uuids)} UUIDs found in the CIRCL database.")
    return uuids
if __name__ == "__main__":
    # NOTE(review): this rebinds IOC_DATABASE from `utils.db`, while the module
    # itself imports it from `python_types.db` — confirm which path is current.
    from utils.db import IOC_DATABASE

    # Manual run: fetch every event the feed has and the test database lacks.
    database_path = "../database/ioc_test.db"
    download_dir = "../database/IOCs"

    known_uuids = getDB_UUIDs(database_path)
    feed_uuids = getCIRCL_UUIDs()
    missing_uuids = findUniqueUUIDs(known_uuids, feed_uuids)
    downloadEvents(missing_uuids, download_dir)
#import os
#import json
#import aiohttp
#import asyncio
#
#async def download_event_async(session, event_filename, events_dir):
# """Télécharge de manière asynchrone un seul événement et le sauvegarde dans un fichier."""
# event_url = f'https://www.circl.lu/doc/misp/feed-osint/{event_filename}'
# file_path = os.path.join(events_dir, event_filename + ".json")
# print(f"Attempting to download: {event_url}")
#
# try:
# async with session.get(event_url, timeout=10) as response:
# print(f"HTTP status code for {event_filename}: {response.status}")
# if response.status == 200:
# data = await response.json()
# with open(file_path, 'w') as file:
# json.dump(data, file)
# print(f'Downloaded: {event_filename}')
# return True
# else:
# print(f'Failed to download: {event_filename} - Status code: {response.status}')
# return False
# except Exception as e:
# print(f"Error downloading {event_filename}: {e}")
# return False#
#
#async def download_events_async(events_dir, new_events, max_concurrent=20):
# """Télécharge de manière asynchrone plusieurs événements."""
# event_files = [event_json + ".json" for event_json in new_events]
# connector = aiohttp.TCPConnector(limit_per_host=max_concurrent)
# async with aiohttp.ClientSession(connector=connector) as session:
# tasks = []
# for event_json in new_events:
# task = asyncio.create_task(download_event_async(session, event_json, events_dir))
# tasks.append(task)
#
# results = await asyncio.gather(*tasks, return_exceptions=True)
#
# for event_json, result in zip(new_events, results):
# if isinstance(result, Exception):
# print(f"Exception occurred during download of {event_json}: {result}")
# elif not result:
# print(f"Download failed for {event_json}")
#
#def download_events_concurrently_async(events_dir, new_events, max_concurrent=20):
# """Fonction wrapper pour exécuter le téléchargement asynchrone."""
# asyncio.run(download_events_async(events_dir, new_events, max_concurrent))