Compare commits: 3a1d008790...main

6 commits
| Author | SHA1 | Date |
|---|---|---|
|  | 6b5f05fd0e |  |
|  | 986e7f2564 |  |
|  | a84fa675ba |  |
|  | 7ecccda9d8 |  |
|  | a05b3a37f0 |  |
|  | 5440f72452 |  |
`.env.local`

```diff
@@ -3,4 +3,5 @@ COOKIES=""
 USERNAME=""
 PASSWORD=""
 EMAIL=""
 EMAIL_PASSWORD=""
+API_SEARCH_KEY=""
```
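For context, the keys above are consumed in main.py through python-dotenv. A minimal sketch of that flow (the variable names come from this compare; everything else is illustrative):

```python
import os

from dotenv import load_dotenv  # python-dotenv, already imported by main.py

load_dotenv()  # copies key=value pairs from .env into os.environ

# The same keys the template defines; os.getenv returns None for missing keys.
cookies = os.getenv("COOKIES")
username = os.getenv("USERNAME")
password = os.getenv("PASSWORD")
email = os.getenv("EMAIL")
email_password = os.getenv("EMAIL_PASSWORD")
api_search_key = os.getenv("API_SEARCH_KEY")  # the key added in this compare
```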
README.md (new file, 20 added lines)
````diff
@@ -0,0 +1,20 @@
+# Monkeycrawl
+
+A small script to crawl Twitter/X that uses [twscrape](https://github.com/vladkens/twscrape).
+
+Use at your own discretion. Do not multithread, or you risk getting banned or worse.
+
+## Usage
+
+```bash
+python -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+```
+
+Add cookies, accounts, and passwords to `.env.local` and remove the `.local` file extension.
+
+```bash
+python main.py
+```
+
+When the script exits, the file `tweets.json` will contain the scraped tweets.
````
main.py (25 changed lines)
```diff
@@ -1,4 +1,5 @@
 import asyncio
+import atexit
 import random
 from twscrape import API
 import os
```
```diff
@@ -11,17 +12,30 @@ from dotenv import load_dotenv
 load_dotenv()
 OUTPUT_FILE = os.getenv("OUTPUT_FILE", "tweets.json")
 
+_results = []
+
+
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 logger = logging.getLogger(__name__)
 
 def check_if_json_exists(file_path):
+    logger.info(f"Checking if JSON exists at {file_path}")
     return os.path.isfile(file_path) and os.path.getsize(file_path) > 0
 
 def load_json(file_path):
+    logger.info(f"Loading data from {file_path}")
     with open(file_path, "r", encoding="utf-8") as f:
         return json.load(f)
 
+def write_json(file_path, data):
+    logger.info(f"Writing data to {file_path}")
+    with open(file_path, "w", encoding="utf-8") as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+
+atexit.register(lambda: write_json(OUTPUT_FILE, {"tweets": _results}))
+
 async def main():
+    global _results
     api = API()  # or API("path-to.db") – default is `accounts.db`
     # ADD ACCOUNTS (for CLI usage see next readme section)
 
```
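A note on the new `atexit` hook: the lambda looks up the module-global `_results` at call time (which is why `global _results` was added inside `main()`), so whatever has been accumulated is flushed to `OUTPUT_FILE` on any normal interpreter exit, including an unhandled Ctrl+C. A standalone sketch of that pattern (file name and data are placeholders, not repo code):

```python
# Standalone sketch of the atexit pattern used above (not repo code).
import atexit
import json

_results = []  # module-level so the exit hook sees later appends/reassignments

def _flush():
    # Runs on normal interpreter shutdown, including after an unhandled
    # KeyboardInterrupt; it does NOT run on os._exit() or a hard kill.
    with open("out.json", "w", encoding="utf-8") as f:
        json.dump({"tweets": _results}, f, ensure_ascii=False, indent=4)

atexit.register(_flush)

_results.append({"id": 1})  # whatever the main loop accumulates
# On exit, out.json contains everything appended so far.
```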
```diff
@@ -32,6 +46,7 @@ async def main():
     password = os.getenv("PASSWORD")
     email = os.getenv("EMAIL")
     email_password = os.getenv("EMAIL_PASSWORD")
+    api_search_key = os.getenv("API_SEARCH_KEY")
 
     await api.pool.add_account(username, password, email, email_password, cookies=cookies)
     await api.pool.login_all()  # try to login to receive account cookies
```
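The pool calls above match twscrape's documented account flow. A minimal standalone sketch, with placeholder credentials (none of these values are real):

```python
# Minimal twscrape pool setup, mirroring the calls in this diff.
# All credential and cookie values below are placeholders.
import asyncio

from twscrape import API

async def setup() -> API:
    api = API()  # backed by accounts.db by default
    await api.pool.add_account(
        "user1", "pass1", "mail1@example.com", "mail_pass1",
        cookies="auth_token=...; ct0=...",  # optional; supplying cookies avoids fragile logins
    )
    await api.pool.login_all()  # tries to log in accounts that still lack working cookies
    return api

if __name__ == "__main__":
    asyncio.run(setup())
```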
```diff
@@ -40,20 +55,16 @@ async def main():
 
     if check_if_json_exists(OUTPUT_FILE):
         _results = load_json(OUTPUT_FILE).get("tweets", [])
-    else:
-        _results = []
 
-    async for rep in api.search("AI", limit=5):
+    async for rep in api.search(api_search_key):
         try:
             _results.append(rep.json())
+            logger.info("Appended tweet JSON")
         except Exception:
             logger.error("Failed to parse tweet JSON")
 
-        await asyncio.sleep(random.uniform(7, 15))  # random delay between 7 and 15 seconds
-
-    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
-        f.write(json.dumps({"tweets": _results}, ensure_ascii=False, indent=4))
+        await asyncio.sleep(random.uniform(17, 31))
 
 
 if __name__ == "__main__":
     asyncio.run(main())
```
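One thing worth flagging in this hunk: the end-of-loop file write is gone, so persistence now rests entirely on the atexit hook, and `api.search(api_search_key)` receives `None` whenever `API_SEARCH_KEY` is unset. A hypothetical guard, not part of this compare, that would fail fast instead:

```python
# Hypothetical pre-flight check (not in this PR): refuse to run with an
# empty search query rather than passing None to api.search().
api_search_key = os.getenv("API_SEARCH_KEY")
if not api_search_key:
    raise SystemExit("API_SEARCH_KEY is not set; add it to .env before running")
```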