Compare commits


6 Commits

| Author | SHA1 | Message | Date |
|---|---|---|---|
| fzzinchemical | 6b5f05fd0e | Add API_SEARCH_KEY to environment variables and update main functionality to use it for searches | 2025-11-13 22:27:49 +01:00 |
| fzzinchemical | 986e7f2564 | Added README.md | 2025-11-13 22:26:27 +01:00 |
| fzzinchemical | a84fa675ba | Remove unused signal import from main.py | 2025-11-13 20:58:04 +01:00 |
| fzzinchemical | 7ecccda9d8 | Fix atexit registration for JSON writing to ensure data is saved on exit | 2025-11-13 20:50:07 +01:00 |
| fzzinchemical | a05b3a37f0 | Update random sleep duration in main functionality for API rate limiting | 2025-11-13 20:49:16 +01:00 |
| fzzinchemical | 5440f72452 | Add atexit handling and improve JSON logging in main functionality | 2025-11-13 20:43:26 +01:00 |
3 changed files with 40 additions and 8 deletions


@@ -3,4 +3,5 @@ COOKIES=""
 USERNAME=""
 PASSWORD=""
 EMAIL=""
 EMAIL_PASSWORD=""
+API_SEARCH_KEY=""
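The new `API_SEARCH_KEY` is read with `os.getenv` in main.py below and passed directly to `api.search`. Since `os.getenv` returns `None` when a variable is unset, a fallback can be useful; the sketch below is illustrative only, and the `"AI"` default is simply the query that was previously hard-coded in main.py:

```python
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

# Fall back to the previously hard-coded query if API_SEARCH_KEY is missing or empty.
# "AI" comes from the old api.search("AI", limit=5) call removed in this compare.
api_search_key = os.getenv("API_SEARCH_KEY") or "AI"
print(f"Search query: {api_search_key!r}")
```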

README.md (new file, 20 lines)

@@ -0,0 +1,20 @@
# Monkeycrawl
A small script to crawl Twitter/X that uses [twscrape](https://github.com/vladkens/twscrape).
Use at your own discretion. Do not multithread, or you risk getting banned or worse.
## Usage
```bash
python -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
```
Add cookies, accounts, and passwords to `.env.local`, then remove the `.local` file extension.
```bash
python main.py
```
When the script exits, the file `tweets.json` will contain the scraped tweets.
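
Since everything is flushed to `tweets.json` on exit (see the main.py diff below), here is a minimal sketch for loading the output back in, assuming the `{"tweets": [...]}` layout that `write_json` produces; the exact shape of each entry depends on what `rep.json()` returns:

```python
import json

# Read the file written by the atexit handler registered in main.py.
with open("tweets.json", "r", encoding="utf-8") as f:
    data = json.load(f)

tweets = data.get("tweets", [])
print(f"Loaded {len(tweets)} scraped entries")
```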

main.py (25 lines)

@@ -1,4 +1,5 @@
 import asyncio
+import atexit
 import random
 from twscrape import API
 import os
@@ -11,17 +12,30 @@ from dotenv import load_dotenv
 load_dotenv()
 
 OUTPUT_FILE = os.getenv("OUTPUT_FILE", "tweets.json")
+_results = []
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 logger = logging.getLogger(__name__)
 
 def check_if_json_exists(file_path):
+    logger.info(f"Checking if JSON exists at {file_path}")
     return os.path.isfile(file_path) and os.path.getsize(file_path) > 0
 
 def load_json(file_path):
+    logger.info(f"Loading data from {file_path}")
     with open(file_path, "r", encoding="utf-8") as f:
         return json.load(f)
 
+def write_json(file_path, data):
+    logger.info(f"Writing data to {file_path}")
+    with open(file_path, "w", encoding="utf-8") as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+
+atexit.register(lambda: write_json(OUTPUT_FILE, {"tweets": _results}))
+
 async def main():
+    global _results
     api = API()  # or API("path-to.db") default is `accounts.db`
 
     # ADD ACCOUNTS (for CLI usage see next readme section)
@@ -32,6 +46,7 @@ async def main():
     password = os.getenv("PASSWORD")
     email = os.getenv("EMAIL")
     email_password = os.getenv("EMAIL_PASSWORD")
+    api_search_key = os.getenv("API_SEARCH_KEY")
 
     await api.pool.add_account(username, password, email, email_password, cookies=cookies)
     await api.pool.login_all()  # try to login to receive account cookies
@@ -40,20 +55,16 @@ async def main():
     if check_if_json_exists(OUTPUT_FILE):
         _results = load_json(OUTPUT_FILE).get("tweets", [])
-    else:
-        _results = []
 
-    async for rep in api.search("AI", limit=5):
+    async for rep in api.search(api_search_key):
         try:
             _results.append(rep.json())
+            logger.info("Appended tweet JSON")
         except Exception:
             logger.error("Failed to parse tweet JSON")
-        await asyncio.sleep(random.uniform(7, 15))  # random delay between 7 and 15 seconds
-
-    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
-        f.write(json.dumps({"tweets": _results}, ensure_ascii=False, indent=4))
+        await asyncio.sleep(random.uniform(17, 31))
 
 if __name__ == "__main__":
     asyncio.run(main())
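
The core change across these commits is atexit-based persistence: results accumulate in a module-level list, and a handler registered at import time flushes them to disk when the interpreter exits. Below is a stripped-down, self-contained sketch of that pattern, independent of twscrape; the file name and the `collect` helper are purely illustrative:

```python
import atexit
import json
import logging

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)

OUTPUT_FILE = "results.json"  # illustrative name, not the repo's tweets.json default
_results = []  # module-level buffer, mirrors _results in main.py


def write_json(file_path, data):
    """Flush collected items to disk; runs via atexit when the interpreter exits."""
    logger.info(f"Writing {len(data['items'])} items to {file_path}")
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


# The lambda looks up the module-level _results at exit time, so items appended
# (or a list rebound via `global`) after registration are still written out.
atexit.register(lambda: write_json(OUTPUT_FILE, {"items": _results}))


def collect():
    # Stand-in for the api.search() loop in main.py.
    for i in range(3):
        _results.append({"id": i})
        logger.info("Appended item")


if __name__ == "__main__":
    collect()
    # write_json runs automatically on normal exit and after an unhandled
    # KeyboardInterrupt; it does not run after os._exit() or a hard kill.
```

The registration in the main.py diff above follows the same shape, just writing `{"tweets": _results}` to `OUTPUT_FILE`.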