#!/bin/python
# -*- coding: utf-8 -*-
import argparse import calendar import csv import glob import hashlib import html import importlib import itertools import os import platform import queue import random import re import requests import shutil import signal import subprocess import sys import textwrap import threading import time import xml.etree.ElementTree as ET import zipfile import urllib.request from urllib.error import HTTPError, URLError from datetime import datetime, timedelta from html.parser import HTMLParser from pathlib import Path from io import StringIO
# List of User-Agent strings
# Pool of desktop browser User-Agent strings: three Chrome variants followed
# by three Firefox variants, each covering Windows, macOS and Linux.
user_agents = [
    # Chrome
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    # Firefox
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
]
# List of required third-party modules
# Third-party modules the script depends on. Each name is used both as the
# argument to importlib.import_module() and as the package name given to
# "pip install" in check_and_install_modules().
third_party_modules = [
    'chardet',
    'pygame',
    'bs4',  # BeautifulSoup (part of bs4 package)
    'tqdm',
    'PySimpleGUI',
    'colorama',
]
def _apt_install(package, label):
    """Install *package* with apt via sudo, printing a message on failure.

    Args:
        package: apt package name passed to ``apt install``.
        label: Name used for the package in the failure message.
    """
    try:
        subprocess.check_call(
            ["sudo", "apt", "-qq", "-y", "install", package],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError is raised when the sudo executable itself cannot be started
        # (e.g. it is not installed); treat that like any other failed
        # install instead of aborting the whole setup.
        print(f"Failed to install {label}. Ensure you have sudo privileges.")


def check_and_install_modules():
    """Best-effort installation of system and Python prerequisites.

    On Linux, tries to install ``python3-pip`` and ``python3-tk`` through apt.
    On macOS, only reports whether ``tkinter`` can be imported. On every
    platform, each entry of ``third_party_modules`` is then imported and, if
    the import fails, installed with ``pip`` using the running interpreter.

    Failures are printed rather than raised, so the function always returns
    ``None`` and the caller decides what to do when a later import fails.
    """
    os_name = platform.system()

    if os_name == "Linux":
        # Install pip if not already installed, then the Tk bindings.
        _apt_install("python3-pip", "pip")
        _apt_install("python3-tk", "python3-tk")
    elif os_name == "Darwin":  # macOS
        # Check if Tkinter is available
        try:
            import tkinter  # noqa: F401 -- imported only to probe availability
            print("Tkinter is available.")
        except ImportError:
            print("Tkinter is not available. Please install it manually or ensure your Python installation includes Tkinter.")
            # Optionally, you could guide users to install Python with Tkinter:
            print("You might need to reinstall Python with Tkinter support. For example, using Homebrew:")
            # NOTE(review): current Homebrew may no longer accept formula
            # options such as --with-tcl-tk; confirm this hint is still valid.
            print("brew install python --with-tcl-tk")

    # For Windows, we'll rely on pip for Python packages
    # Note: System packages like tkinter should be pre-installed or installed manually
    for module in third_party_modules:
        try:
            importlib.import_module(module)
        except ImportError:
            print(f"{module} is not installed.")
        else:
            print(f"{module} is already installed.")
            continue

        pip_command = [sys.executable, '-m', 'pip', 'install', module]
        try:
            subprocess.check_call(pip_command)
        except (subprocess.CalledProcessError, OSError):
            print(f"Failed to install {module}.")
        else:
            # Let the import system notice the freshly installed package so a
            # later import in this same process can find it.
            importlib.invalidate_caches()
            print(f"{module} installed successfully.")
def import_modules():
    """Import the third-party dependencies and publish them as module globals.

    Meant to run after the packages have been installed, since every import
    here is of a third-party package (plus ``concurrent.futures`` and
    ``multiprocessing.Pool`` from the standard library). Also creates the
    ``stop_flag`` threading event.

    ``Fore``, ``Style`` and ``init`` are declared global as well; without that
    declaration the colorama import would bind function-local names that
    vanish as soon as this function returns.

    Raises:
        ImportError: If any of the imported packages is unavailable.
    """
    global chardet, concurrent, pygame, BeautifulSoup, Pool, tqdm, stop_flag, sg
    global Fore, Style, init

    import chardet
    import concurrent.futures
    import pygame
    from bs4 import BeautifulSoup
    from multiprocessing import Pool
    from tqdm import tqdm
    import PySimpleGUI as sg

    stop_flag = threading.Event()

    from colorama import Fore, Style, init
# Function to get a random User-Agent
def get_random_user_agent():
    """Return one entry of ``user_agents`` picked with ``random.choice``."""
    chosen_agent = random.choice(user_agents)
    return chosen_agent
# Define global variables and directories
failed_downloads = []  # starts empty; presumably filled by the download code -- verify
verbose = "-v" in sys.argv  # True when "-v" appears anywhere on the command line
edgar_url = "https://www.sec.gov/Archives/edgar/data/"

# NOTE(review): "[email protected]" looks like a web-page e-mail obfuscation
# placeholder rather than a real address. Confirm the intended contact
# address before sending these headers to sec.gov.
headers = {'User-Agent': "anonymous/[email protected]"}
backup_headers = {"User-Agent": "anonymost/[email protected]"}

files_found_count = 0
done = False

# Both working directories live in the user's home directory.
# expanduser("") returns "" and would silently place them in the current
# working directory instead, so the "~" is required here.
download_directory = os.path.join(os.path.expanduser("~"), "sec_archives")
download_directory2 = os.path.join(os.path.expanduser("~"), "edgar")
base_path = download_directory2

# Created at import time so later code can write into them unconditionally.
os.makedirs(download_directory, exist_ok=True)
os.makedirs(download_directory2, exist_ok=True)
# Create a list of all subdirectories from 1993 to 2024, including all four quarters
# Inputs for the quarterly index URLs: every year from 1993 through 2024
# (the range end is exclusive), the four quarter directory names, and the
# root URL they are appended to.
years = range(1993, 2025)
quarters = ["QTR1", "QTR2", "QTR3", "QTR4"]
base_url = "https://www.sec.gov/Archives/edgar/full-index"
subdirectories = [ f"{base_url}/{year}/{quarter}/master.zip" for year in years for quarter in quarters if not (year == 20…
Comments
No comments yet
Please complete the captcha