feat: implement binance p2p collector daemon
Set up continuous P2P VES/USDT market history data collection, normalization, validation, and date-partitioned Parquet storage.
This commit is contained in:
parent
8e7a77fe61
commit
2c41a7a6b3
19 changed files with 2313 additions and 0 deletions
11
.gitignore
vendored
Normal file
11
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
.venv/
|
||||
venv/
|
||||
ENV/
|
||||
p2p-collector/data/
|
||||
*.log
|
||||
*.tmp
|
||||
*.alert
|
||||
checkpoint.json
|
||||
760
base_plan.md
Normal file
760
base_plan.md
Normal file
|
|
@ -0,0 +1,760 @@
|
|||
# Binance P2P Data Collection — Detailed Implementation Spec
|
||||
|
||||
> **Purpose:** This document is the single source of truth for the data collection phase. Every field, every endpoint, every edge case is specified so a coder can implement without ambiguity.
|
||||
>
|
||||
> **Status:** Phase 1 — Data Collection only. No ML. No trading. No algorithm decisions yet.
|
||||
|
||||
---
|
||||
|
||||
## 1. The Core Loop (Exact Pseudocode)
|
||||
|
||||
```
|
||||
consecutive_failures = 0  # failure streak; cleared after every fully successful cycle

while True:
    cycle_start = monotonic()
    now_utc = utc_now()  # one timestamp shared by the BUY and SELL snapshots of this cycle

    try:
        buy_snap = fetch_all_ads(trade_type="BUY", asset="USDT", fiat="VES")
        sleep(1)  # fixed pause between the BUY and SELL snapshots (see section 2.5)
        sell_snap = fetch_all_ads(trade_type="SELL", asset="USDT", fiat="VES")

        flat_buy = [normalize_ad(ad, "BUY", now_utc) for ad in buy_snap]
        flat_sell = [normalize_ad(ad, "SELL", now_utc) for ad in sell_snap]

        validate_snapshot(flat_buy + flat_sell)

        # store_parquet derives the year=/month=/day= partition from now_utc
        store_parquet(flat_buy, base_path / "raw" / "buy_ads", now_utc)
        store_parquet(flat_sell, base_path / "raw" / "sell_ads", now_utc)

        consecutive_failures = 0
        log_success(len(flat_buy), len(flat_sell), monotonic() - cycle_start)

    except Exception as e:
        # Count the failure first so the log and the alert see the current streak
        consecutive_failures += 1
        log_error(e, consecutive_failures)
        if consecutive_failures >= 5:
            write_alert_file()  # human needs to check

    sleep(jitter(interval_seconds))  # default 300s ± 10%
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2. API Client — Exact Implementation
|
||||
|
||||
### 2.1 Endpoint
|
||||
|
||||
```
|
||||
POST https://p2p.binance.com/bapi/c2c/v2/friendly/c2c/adv/search
|
||||
```
|
||||
|
||||
**No API key.** This is fully public.
|
||||
|
||||
### 2.2 Headers
|
||||
|
||||
| Header | Value |
|
||||
|---|---|
|
||||
| `Content-Type` | `application/json` |
|
||||
| `User-Agent` | `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36` |
|
||||
| `Accept` | `*/*` |
|
||||
| `Origin` | `https://p2p.binance.com` |
|
||||
| `Referer` | `https://p2p.binance.com/` |
|
||||
|
||||
### 2.3 Request Body (BUY example)
|
||||
|
||||
```json
|
||||
{
|
||||
"asset": "USDT",
|
||||
"fiat": "VES",
|
||||
"tradeType": "BUY",
|
||||
"page": 1,
|
||||
"rows": 20,
|
||||
"payTypes": [],
|
||||
"countries": [],
|
||||
"publisherType": null,
|
||||
"classify": "personal",
|
||||
"filter": {}
|
||||
}
|
||||
```
|
||||
|
||||
**Key notes for the coder:**
|
||||
- `tradeType: "BUY"` = advertiser wants to **give you VES** in exchange for your USDT. They are *buying* USDT from you.
|
||||
- `tradeType: "SELL"` = advertiser wants to **give you USDT** in exchange for your VES. They are *selling* USDT to you.
|
||||
- `payTypes: []` = no filter, return all payment methods
|
||||
- `rows: 20` = Binance's max per page (do not change)
|
||||
- `publisherType: null` = both merchants and regular users
|
||||
- `classify: "personal"` = personal ads (not business) — covers the P2P marketplace
|
||||
|
||||
### 2.4 Pagination Logic
|
||||
|
||||
```python
|
||||
def fetch_all_ads(trade_type, asset, fiat, max_pages=10):
    """Fetch every page of P2P ads for one side of the book.

    Args:
        trade_type: "BUY" or "SELL", sent as the request's ``tradeType``.
        asset: Crypto asset code, e.g. "USDT".
        fiat: Fiat currency code, e.g. "VES".
        max_pages: Upper bound on the number of pages requested.

    Returns:
        A list of raw ad dicts exactly as returned under the response's
        ``data`` key, concatenated across pages.

    Raises:
        httpx.HTTPStatusError: On a non-2xx response (via ``raise_for_status``).
        APIError: When the response body reports ``success`` as falsy.
    """
    page_size = 20  # rows requested per page
    all_ads = []

    for page in range(1, max_pages + 1):
        body = {
            "asset": asset,
            "fiat": fiat,
            "tradeType": trade_type,
            "page": page,
            "rows": page_size,
            "payTypes": [],
            "countries": [],
            "publisherType": None,
            "classify": "personal",
            "filter": {},
        }

        resp = httpx.post(URL, json=body, headers=HEADERS, timeout=15)
        resp.raise_for_status()

        data = resp.json()

        if not data.get("success"):
            raise APIError(f"API returned success=false: {data}")

        # `or` also covers an explicit JSON null, which .get(key, default) would pass through
        ads = data.get("data") or []
        total = data.get("total") or 0

        # An empty page means there is nothing more to read, whatever `total` claims
        if not ads:
            break

        all_ads.extend(ads)

        # Stop once everything is collected, or the next page would start beyond `total`
        if len(all_ads) >= total or page * page_size >= total:
            break

        if page < max_pages:
            time.sleep(0.5)  # 500ms between pages

    return all_ads
|
||||
```
|
||||
|
||||
### 2.5 Rate Limiting — Defensive Strategy
|
||||
|
||||
| Event | Wait time | Notes |
|
||||
|---|---|---|
|
||||
| Between pages (same snapshot) | 500 ms | Fixed |
|
||||
| Between snapshots (BUY → SELL) | 1 second | Fixed |
|
||||
| Between full cycles | 300 s ± 30s | Jittered to avoid clock sync |
|
||||
| HTTP 429 (rate limited) | 60s → 120s → 240s → 480s | Exponential backoff, cap at 480s |
|
||||
| Connection error | 30s retry | Transient network issues |
|
||||
| 5xx server error | 60s retry | Binance server-side issues |
|
||||
|
||||
**Important:** After a 429, reset the backoff after one successful full snapshot.
|
||||
|
||||
### 2.6 Proxy Support (Optional — keep simple first)
|
||||
|
||||
Start with **no proxy**, direct from VPS. Only add proxy rotation if we hit rate limits. Binance rarely rate-limits P2P at one collection cycle (a handful of paginated requests) every 5 minutes.
|
||||
|
||||
---
|
||||
|
||||
## 3. Normalization — Exact Field Mapping
|
||||
|
||||
### 3.1 The Flattened Schema (one row = one ad)
|
||||
|
||||
| # | Output field | Type | JSON path | Notes |
|
||||
|---|---|---|---|---|
|
||||
| 1 | `snapshot_id` | string | auto: `{fetch_ts_iso}_{trade_type}` | e.g. `"20260605T133000Z_BUY"` |
|
||||
| 2 | `fetched_at` | datetime | auto: now_utc | Always UTC |
|
||||
| 3 | `fetched_date` | string | auto: YYYY-MM-DD | Partition column |
|
||||
| 4 | `trade_type` | string | `adv.tradeType` | "BUY" or "SELL" |
|
||||
| 5 | `adv_no` | string | `adv.advNo` | Unique ad ID |
|
||||
| 6 | `asset` | string | `adv.asset` | "USDT" |
|
||||
| 7 | `fiat` | string | `adv.fiatUnit` | "VES" |
|
||||
| 8 | `price` | float | `adv.price` | Parse as float |
|
||||
| 9 | `surplus_amount` | float | `adv.surplusAmount` | Remaining USDT |
|
||||
| 10 | `min_amount` | float | `adv.minSingleTransAmount` | Min USDT per trade |
|
||||
| 11 | `max_amount` | float | `adv.maxSingleTransAmount` | Max USDT per trade |
|
||||
| 12 | `tradable_quantity` | float | `adv.tradableQuantity` | Same as surplus? |
|
||||
| 13 | `advertiser_no` | string | `advertiser.userNo` | **Stable ID** — use this |
|
||||
| 14 | `advertiser_name` | string | `advertiser.nickName` | For reference only |
|
||||
| 15 | `advertiser_type` | string | `advertiser.userType` | "merchant" or "user" |
|
||||
| 16 | `month_order_count` | int | `advertiser.monthOrderCount` | |
|
||||
| 17 | `month_finish_rate` | float | `advertiser.monthFinishRate` | 0.0 to 1.0 |
|
||||
| 18 | `positive_rate` | float | `advertiser.positiveRate` | 0.0 to 1.0 |
|
||||
| 19 | `user_positive_rate` | float | `advertiser.userPositiveRate` | older field, same idea |
|
||||
| 20 | `payment_methods` | list[str] | `adv.tradeMethods[].payType` | e.g. `["BANESCO", "PAGO_MOVIL"]` |
|
||||
| 21 | `payment_method_ids` | list[str] | `adv.tradeMethods[].identifier` | e.g. `["Banco_Banesco", "Pago_Movil"]` |
|
||||
| 22 | `ad_created_at` | datetime | `adv.createTime` | Unix millisecond → datetime |
|
||||
| 23 | `price_type` | string | `adv.priceType` | Usually "FIXED" |
|
||||
|
||||
### 3.2 JSON Path Details (nested structure)
|
||||
|
||||
The API response has this structure:
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"adv": {
|
||||
"advNo": "6f8b2e...",
|
||||
"tradeType": "BUY",
|
||||
"asset": "USDT",
|
||||
"fiatUnit": "VES",
|
||||
"price": "58.50",
|
||||
"surplusAmount": "1520.43",
|
||||
"maxSingleTransAmount": "5000.00",
|
||||
"minSingleTransAmount": "100.00",
|
||||
"tradableQuantity": "1520.43",
|
||||
"createTime": 1749128400000,
|
||||
"fiatSymbol": "Bs",
|
||||
"priceType": "FIXED",
|
||||
"tradeMethods": [
|
||||
{
|
||||
"identifier": "Banco_Banesco",
|
||||
"payType": "BANESCO",
|
||||
"payMethodId": "BANESCO"
|
||||
},
|
||||
{
|
||||
"identifier": "Pago_Movil",
|
||||
"payType": "PAGO_MOVIL",
|
||||
"payMethodId": "PAGO_MOVIL"
|
||||
}
|
||||
]
|
||||
},
|
||||
"advertiser": {
|
||||
"userNo": "ABC123",
|
||||
"nickName": "CryptoTraderVE",
|
||||
"userType": "merchant",
|
||||
"monthOrderCount": 342,
|
||||
"monthFinishRate": 0.97,
|
||||
"positiveRate": 0.99,
|
||||
"userPositiveRate": 0.99
|
||||
}
|
||||
}
|
||||
],
|
||||
"total": 156,
|
||||
"pageSize": 20,
|
||||
"success": true
|
||||
}
|
||||
```
|
||||
|
||||
### 3.3 Normalization Code Sketch
|
||||
|
||||
```python
|
||||
def _to_float(value, default=0.0):
    """Return *value* as a float, or *default* when it is None or unparseable."""
    if value is None:
        return default
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def normalize_ad(raw_ad: dict, trade_type: str, fetched_at: datetime) -> dict:
    """Flatten one raw API ad into the 23-field row described in section 3.1.

    Missing or null fields never raise: numeric fields fall back to 0.0,
    list fields to ``[]``, and the critical fields (``price``, ``adv_no``,
    ``advertiser_no``, ``ad_created_at``) fall back to ``None`` so the
    row-level validator can reject the ad instead of the normalizer crashing.

    Args:
        raw_ad: One element of the response's ``data`` list, holding the
            nested ``adv`` and ``advertiser`` objects.
        trade_type: "BUY" or "SELL"; the side this snapshot was requested for.
        fetched_at: Snapshot timestamp, used for ``snapshot_id``,
            ``fetched_at`` and ``fetched_date``.

    Returns:
        A flat dict with the 23 output fields.
    """
    adv = raw_ad.get("adv") or {}
    adver = raw_ad.get("advertiser") or {}

    # `or []` also covers an explicit JSON null for tradeMethods
    methods = adv.get("tradeMethods") or []
    payment_methods = [m["payType"] for m in methods if m.get("payType")]
    payment_method_ids = [m["identifier"] for m in methods if m.get("identifier")]

    # createTime is a Unix timestamp in milliseconds
    created_ms = _to_float(adv.get("createTime"), None)
    ad_created_at = (
        datetime.fromtimestamp(created_ms / 1000, tz=timezone.utc)
        if created_ms is not None
        else None
    )

    return {
        "snapshot_id": f"{fetched_at.strftime('%Y%m%dT%H%M%SZ')}_{trade_type}",
        "fetched_at": fetched_at,
        "fetched_date": fetched_at.strftime("%Y-%m-%d"),
        "trade_type": trade_type,
        "adv_no": adv.get("advNo"),
        "asset": adv.get("asset"),
        "fiat": adv.get("fiatUnit"),
        # None (not 0.0) so a missing price is distinguishable and gets rejected
        "price": _to_float(adv.get("price"), None),
        "surplus_amount": _to_float(adv.get("surplusAmount")),
        "min_amount": _to_float(adv.get("minSingleTransAmount")),
        "max_amount": _to_float(adv.get("maxSingleTransAmount")),
        "tradable_quantity": _to_float(adv.get("tradableQuantity")),
        "advertiser_no": adver.get("userNo"),
        "advertiser_name": adver.get("nickName"),
        "advertiser_type": adver.get("userType") or "user",
        "month_order_count": int(adver.get("monthOrderCount") or 0),
        "month_finish_rate": _to_float(adver.get("monthFinishRate")),
        "positive_rate": _to_float(adver.get("positiveRate")),
        "user_positive_rate": _to_float(adver.get("userPositiveRate")),
        "payment_methods": payment_methods,  # e.g. ["BANESCO", "PAGO_MOVIL"]
        "payment_method_ids": payment_method_ids,  # e.g. ["Banco_Banesco", "Pago_Movil"]
        "ad_created_at": ad_created_at,
        "price_type": adv.get("priceType") or "FIXED",
    }
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. Payment Methods — The Critical Column
|
||||
|
||||
### 4.1 Known Payment Method Identifiers for Venezuela
|
||||
|
||||
| `payType` value | `identifier` value | Common name |
|
||||
|---|---|---|
|
||||
| `BANESCO` | `Banco_Banesco` | Banesco bank transfer |
|
||||
| `MERCANTIL` | `Banco_Mercantil` | Mercantil bank transfer |
|
||||
| `PROVINCIAL` | `Banco_Provincial` | Banco Provincial (BBVA) |
|
||||
| `VENEZUELA` | `Banco_De_Venezuela` | Banco de Venezuela (BDV) |
|
||||
| `BANCO_NACIONAL_CREDITO` | `Banco_Nacional_De_Credito` | BNC |
|
||||
| `SOFITASA` | `Sofitasa` | Sofitasa |
|
||||
| `BANCAMIGA` | `Bancamiga` | Bancamiga |
|
||||
| `BANCO_EXTERIOR` | `Banco_Exterior` | Banco Exterior |
|
||||
| `BANCO_OCCIDENTE` | `Banco_Occidente` | Banco Occidental de Descuento (BOD) |
|
||||
| `BANCO_PLATA` | `Banco_Plata` | Banco Plaza |
|
||||
| `BANESCO_PERSONAL` | `Banesco_Personal` | Banesco personal account |
|
||||
| `PAGO_MOVIL` | `Pago_Movil` | Mobile payment (inter-bank) |
|
||||
| `BANCANET` | `Bancanet` | Bancanet |
|
||||
| `BANPLUS` | `Banplus` | Banplus |
|
||||
| `ZELLE` | `Zelle` | Zelle (USD, not VES) |
|
||||
| `PAYPAL` | `Paypal` | PayPal (USD) |
|
||||
| `CASH_VEF` | `Efectivo_VEF` | Cash in VES |
|
||||
| `CASH_USD` | `Efectivo_USD` | Cash in USD |
|
||||
| `PAGO_MOVIL` | `Pago_Movil_Banco_Venezuela` | Mobile payment at specific bank |
|
||||
|
||||
### 4.2 Why This Matters for Bank Arbitrage
|
||||
|
||||
```python
|
||||
# Example analysis query after ~1 week of data:
|
||||
# For each snapshot, find the best path:
|
||||
#
|
||||
# Best BUY price (sell USDT → get VES): Banesco, 60.50 VES/USDT
|
||||
# Best SELL price (buy USDT → give VES): Mercantil, 62.30 VES/USDT
|
||||
# Gross arbitrage: 62.30 - 60.50 = 1.80 VES/USDT = ~2.9% spread
|
||||
#
|
||||
# If same bank: you lose 0% on internal transfer
|
||||
# If different banks: you lose bank transfer fee (maybe 0.5%)
|
||||
# Net profit = 2.9% - 0.5% = 2.4% per round trip
|
||||
```
|
||||
|
||||
### 4.3 Storage Consideration
|
||||
|
||||
`payment_methods` is a **list of strings** — this is fine in Parquet (stored as a repeated field). For CSV it would need to be JSON-encoded or one-hot encoded later.
|
||||
|
||||
---
|
||||
|
||||
## 5. Storage — Exact File Layout
|
||||
|
||||
```
|
||||
/path/to/data/
|
||||
├── raw/
|
||||
│ ├── buy_ads/
|
||||
│ │ └── year=2026/
|
||||
│ │ └── month=06/
|
||||
│ │ └── day=05/
|
||||
│ │ ├── snapshot_20260605_133000.parquet
|
||||
│ │ ├── snapshot_20260605_133500.parquet
|
||||
│ │ └── ...
|
||||
│ ├── sell_ads/
|
||||
│ │ └── year=2026/
|
||||
│ │ └── month=06/
|
||||
│ │ └── day=05/
|
||||
│ │ ├── snapshot_20260605_133000.parquet
|
||||
│ │ └── ...
|
||||
│ └── daily_merged/ <-- OPTIONAL: daily combined view
|
||||
│ └── year=2026/
|
||||
│ └── month=06/
|
||||
│ └── 2026-06-05.parquet
|
||||
│
|
||||
├── logs/
|
||||
│ └── collector_20260605.log
|
||||
│
|
||||
├── alerts/ <-- alert marker files go here
|
||||
│ └── (empty if no issues)
|
||||
│
|
||||
└── checkpoint.json <-- for restart resilience
|
||||
```
|
||||
|
||||
### 5.1 File Naming Convention
|
||||
|
||||
**Snapshot files:** `snapshot_{YYYYMMDD}_{HHMMSS}.parquet`
|
||||
- Time used: the start timestamp of the snapshot (UTC)
|
||||
- Example: `snapshot_20260605_133000.parquet`
|
||||
|
||||
**Why no UUIDs?** The timestamp + trade_type partition is already unique. No repeated names unless you run two collectors (don't).
|
||||
|
||||
### 5.2 Atomic Writes (No Partial Files)
|
||||
|
||||
```python
|
||||
def store_parquet(rows, base_dir, fetched_at):
    """Write one snapshot to a date-partitioned Parquet file without exposing partial files.

    The data is written to ``<name>.parquet.tmp`` first and then moved onto
    the final name, so readers only ever see complete ``.parquet`` files.

    Args:
        rows: List of flat row dicts; nothing is written when it is empty.
        base_dir: Root directory for this trade side (e.g. ``.../raw/buy_ads``).
        fetched_at: Snapshot timestamp; determines both the
            ``year=/month=/day=`` partition and the file name.

    Returns:
        None.
    """
    if not rows:
        return

    # Build partition path from timestamp
    year = fetched_at.strftime("%Y")
    month = fetched_at.strftime("%m")
    day = fetched_at.strftime("%d")
    filename = f"snapshot_{fetched_at.strftime('%Y%m%d_%H%M%S')}.parquet"

    dest_dir = Path(base_dir) / f"year={year}" / f"month={month}" / f"day={day}"
    dest_dir.mkdir(parents=True, exist_ok=True)

    # Write to temp file first
    tmp_path = dest_dir / (filename + ".tmp")
    final_path = dest_dir / filename

    df = pd.DataFrame(rows)
    try:
        df.to_parquet(tmp_path, index=False, engine="pyarrow")
        # replace() overwrites an existing destination on every platform,
        # whereas rename() fails on Windows when the target already exists
        tmp_path.replace(final_path)
    finally:
        # After a successful replace() the temp file is gone; this only
        # removes the leftover from a failed write or move
        if tmp_path.exists():
            tmp_path.unlink()
|
||||
```
|
||||
|
||||
### 5.3 Schema Consistency Check
|
||||
|
||||
Each partition directory should get a schema marker file, written once after the first successful snapshot write:
|
||||
|
||||
```python
|
||||
# After the first successful write in a partition, write _schema.parquet as a reference
schema_path = dest_dir / "_schema.parquet"
if not schema_path.exists():
    import pyarrow as pa
    import pyarrow.parquet as pq

    # Derive the Arrow schema from the populated frame: a zero-row slice
    # (df.iloc[:0]) gives the writer no values to infer object-column types
    # from, so string and list columns would lose their real types.
    schema = pa.Schema.from_pandas(df, preserve_index=False)
    pq.write_table(schema.empty_table(), schema_path)  # zero rows, full schema
|
||||
```
|
||||
|
||||
This allows downstream readers to discover the schema without reading a full snapshot.
|
||||
|
||||
---
|
||||
|
||||
## 6. Data Validation During Collection
|
||||
|
||||
### 6.1 Row-Level Rejection Rules
|
||||
|
||||
Reject (skip, don't crash) individual ads if:
|
||||
|
||||
| Condition | Why | Action |
|
||||
|---|---|---|
|
||||
| `price` is None or ≤ 0 | Bad data | Log warning, skip |
|
||||
| `surplusAmount` is None or ≤ 0 | Ad has no USDT left | Log debug, skip |
|
||||
| `monthFinishRate` is 0.0 and `monthOrderCount` > 0 | Merchant hasn't completed any orders (suspicious) | Log warning, skip |
|
||||
| `price` < 1.0 or `price` > 500.0 | Way outside VES/USDT normal range (should be ~50–150) | Log warning, skip this ad |
|
||||
| Empty `advNo` | Missing identifier | Log error, skip |
|
||||
| Duplicate `advNo` within same snapshot | Possible API glitch | Log warning, keep first occurrence |
|
||||
|
||||
### 6.2 Snapshot-Level Validation
|
||||
|
||||
After collecting all ads for one snapshot:
|
||||
|
||||
```
|
||||
✅ TOTAL ADS: BUY=47 SELL=53 (should be 20-200 each)
|
||||
✅ PRICE RANGE: BUY [54.20 - 62.80] SELL [58.00 - 68.50]
|
||||
(SELL should be consistently higher than BUY)
|
||||
If not: LOG WARNING "BUY/SELL overlap detected"
|
||||
✅ SPREAD: SELL_min - BUY_max = 58.00 - 62.80 = -4.80
|
||||
(If negative: spread is inverted — unusual but possible)
|
||||
Log: "Current spread: {spread:.2f} VES/USDT"
|
||||
✅ MEDIAN PRICE: BUY=58.30 SELL=63.50
|
||||
✅ AD STALENESS: 0 ads with createTime > 7 days old
|
||||
(If any: they're stale, still keep them, but log it)
|
||||
✅ EMPTY SNAPSHOT: If BUY=0 AND SELL=0 → CRITICAL ALERT
|
||||
```
|
||||
|
||||
### 6.3 Snapshot Summary Log Line (one line per snapshot)
|
||||
|
||||
```
|
||||
2026-06-05 13:30:00 UTC | BUY=47 ads [54.20–62.80] SELL=53 ads [58.00–68.50] | spread= -4.80 | took 3.2s | methods=[BANESCO,PAGO_MOVIL,MERCANTIL,...]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 7. Scheduling & Lifecycle
|
||||
|
||||
### 7.1 Startup Behavior
|
||||
|
||||
```
|
||||
1. Read checkpoint.json (if exists)
|
||||
→ "last_completed_snapshot": "2026-06-05T13:25:00Z"
|
||||
→ Wait until (last_completed + interval) before starting
|
||||
→ If checkpoint is missing or corrupted, start immediately
|
||||
|
||||
2. Verify data directory is writable
|
||||
→ Try writing a test file, then delete it
|
||||
|
||||
3. Log: "Starting collector. Interval=300s. Pairs=USDT/VES"
|
||||
```
|
||||
|
||||
### 7.2 Graceful Shutdown
|
||||
|
||||
```python
|
||||
import signal
|
||||
|
||||
running = True
|
||||
|
||||
def handle_signal(sig, frame):
    """Signal handler that asks the main loop to stop.

    Logs the received signal and clears the module-level ``running`` flag.
    ``frame`` is accepted because the ``signal`` module passes it to every
    handler; it is not used.
    """
    global running
    logging.info("Received signal %s, finishing current snapshot...", sig)
    running = False
|
||||
|
||||
signal.signal(signal.SIGINT, handle_signal)
|
||||
signal.signal(signal.SIGTERM, handle_signal)
|
||||
|
||||
# In main loop:
|
||||
while running:
|
||||
# ... do snapshot ...
|
||||
# Write checkpoint after each successful snapshot
|
||||
write_checkpoint({"last_completed_snapshot": now_utc.isoformat()})
|
||||
```
|
||||
|
||||
### 7.3 Checkpoint File Format
|
||||
|
||||
```json
|
||||
{
|
||||
"last_completed_snapshot": "2026-06-05T13:30:00Z",
|
||||
"last_buy_ad_count": 47,
|
||||
"last_sell_ad_count": 53,
|
||||
"consecutive_failures": 0,
|
||||
"total_snapshots": 284,
|
||||
"first_snapshot": "2026-06-01T00:00:00Z",
|
||||
"version": "1.0"
|
||||
}
|
||||
```
|
||||
|
||||
### 7.4 Alert Marker File
|
||||
|
||||
After 5 consecutive failures, write:
|
||||
|
||||
```
|
||||
/path/to/data/alerts/20260605_133000_5_failures.alert
|
||||
```
|
||||
|
||||
Content:
|
||||
```json
|
||||
{
|
||||
"timestamp": "2026-06-05T13:30:00Z",
|
||||
"error": "HTTP 500 after 3 retries",
|
||||
"consecutive_failures": 5,
|
||||
"traceback": "..."
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 8. First-Run Verification Protocol
|
||||
|
||||
After the collector writes its **first snapshot**, the coder should manually verify:
|
||||
|
||||
### Step 1: Read the Parquet file back
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
df = pd.read_parquet("data/raw/buy_ads/year=2026/month=06/day=05/snapshot_20260605_133000.parquet")
|
||||
df.info()
|
||||
df.head()
|
||||
```
|
||||
|
||||
Check:
|
||||
- [ ] All columns present (23 columns from spec)
|
||||
- [ ] No null values in critical fields (price, adv_no, advertiser_no)
|
||||
- [ ] `price` is float type, not string
|
||||
- [ ] `fetched_at` is datetime type
|
||||
- [ ] `payment_methods` is a proper list column
|
||||
|
||||
### Step 2: Verify BUY vs SELL logic
|
||||
|
||||
```python
|
||||
buy_ads = df[df["trade_type"] == "BUY"]
|
||||
sell_ads = df[df["trade_type"] == "SELL"]
|
||||
|
||||
print(f"BUY ads count: {len(buy_ads)}")
|
||||
print(f"SELL ads count: {len(sell_ads)}")
|
||||
print(f"BUY price range: {buy_ads['price'].min():.2f} - {buy_ads['price'].max():.2f}")
|
||||
print(f"SELL price range: {sell_ads['price'].min():.2f} - {sell_ads['price'].max():.2f}")
|
||||
```
|
||||
|
||||
Expected: SELL prices are higher than BUY prices (advertiser selling USDT charges a premium vs. buying USDT).
|
||||
|
||||
### Step 3: Verify payment methods are captured
|
||||
|
||||
```python
|
||||
all_methods = set()
|
||||
for methods in df["payment_methods"]:
|
||||
all_methods.update(methods)
|
||||
print(f"Payment methods found: {sorted(all_methods)}")
|
||||
```
|
||||
|
||||
Expected: At least BANESCO and PAGO_MOVIL will appear. Possibly 5–15 different banks.
|
||||
|
||||
### Step 4: Verify advertiser diversity
|
||||
|
||||
```python
|
||||
print(f"Unique advertisers: {df['advertiser_no'].nunique()}")
|
||||
print(f"Merchants: {(df['advertiser_type'] == 'merchant').sum()}")
|
||||
print(f"Users: {(df['advertiser_type'] == 'user').sum()}")
|
||||
```
|
||||
|
||||
### Step 5: Run the collector for 1 hour (~12 snapshots) and verify:
|
||||
|
||||
```bash
|
||||
ls data/raw/buy_ads/year=2026/month=06/day=05/ | wc -l
|
||||
# Should be ~12
|
||||
```
|
||||
|
||||
- [ ] No duplicate timestamps
|
||||
- [ ] No gaps > 6 minutes
|
||||
- [ ] No crash/restart in the logs
|
||||
|
||||
---
|
||||
|
||||
## 9. File & Module Structure (Exact)
|
||||
|
||||
```
|
||||
p2p-collector/
|
||||
├── collect_p2p.py # Entry point: argument parsing, main loop
|
||||
├── config.yaml # All configurable settings
|
||||
├── binance_client.py # fetch_all_ads(), pagination, rate limiting
|
||||
├── normalizer.py # normalize_ad(), flatten schema
|
||||
├── storage.py # store_parquet(), atomic writes, checkpoint
|
||||
├── validator.py # validate_row(), validate_snapshot()
|
||||
├── scheduler.py # main loop, sleep/jitter, signal handling
|
||||
├── alert.py # write_alert_file(), logging setup
|
||||
├── utils.py # jitter(), datetime helpers
|
||||
├── requirements.txt # pinned versions
|
||||
├── Makefile # setup, run, clean, test commands
|
||||
├── tests/
|
||||
│ ├── test_normalizer.py # Test with sample API response
|
||||
│ ├── test_storage.py # Test atomic writes
|
||||
│ └── test_validator.py # Test rejection rules
|
||||
├── sample_responses/
|
||||
│ ├── response_buy.json # One real-ish API response for tests
|
||||
│ └── response_sell.json
|
||||
└── README.md # Run instructions
|
||||
```
|
||||
|
||||
### requirements.txt
|
||||
|
||||
```
|
||||
httpx>=0.27,<1.0
|
||||
pandas>=2.0,<3.0
|
||||
pyarrow>=14.0,<16.0
|
||||
pyyaml>=6.0,<7.0
|
||||
```
|
||||
|
||||
Note: `httpx` over `requests` because it applies a timeout by default and has a cleaner API. Fall back to `requests` if the coder prefers.
|
||||
|
||||
---
|
||||
|
||||
## 10. `config.yaml` — Complete Reference
|
||||
|
||||
```yaml
|
||||
binance:
|
||||
base_url: "https://p2p.binance.com/bapi/c2c/v2/friendly/c2c/adv/search"
|
||||
user_agent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
||||
timeout_seconds: 15
|
||||
max_pages: 10
|
||||
request_delay_seconds: 0.5
|
||||
|
||||
collection:
|
||||
pairs:
|
||||
- asset: "USDT"
|
||||
fiat: "VES"
|
||||
interval_seconds: 300
|
||||
output_dir: "./data/raw"
|
||||
retry_attempts: 3
|
||||
retry_delay_base_seconds: 10
|
||||
|
||||
validation:
|
||||
price_min: 1.0
|
||||
price_max: 500.0
|
||||
reject_zero_finish_rate: true
|
||||
reject_zero_surplus: true
|
||||
|
||||
logging:
|
||||
level: "INFO"
|
||||
file: "./data/logs/collector.log"
|
||||
max_bytes: 10485760 # 10 MB
|
||||
backup_count: 5
|
||||
format: "%(asctime)s | %(levelname)s | %(message)s"
|
||||
|
||||
alerts:
|
||||
consecutive_failure_threshold: 5
|
||||
alert_dir: "./data/alerts"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 11. Run Modes
|
||||
|
||||
### Mode 1: One-shot test
|
||||
|
||||
```bash
|
||||
python collect_p2p.py --once
|
||||
```
|
||||
|
||||
- Fetches one BUY snapshot + one SELL snapshot
|
||||
- Writes to disk
|
||||
- Prints summary
|
||||
- Exits
|
||||
- Used for: first run, testing, debugging
|
||||
|
||||
### Mode 2: Daemon (continuous)
|
||||
|
||||
```bash
|
||||
python collect_p2p.py
|
||||
```
|
||||
|
||||
- Runs forever
|
||||
- Loop with interval
|
||||
- Graceful shutdown on Ctrl+C
|
||||
|
||||
### Mode 3: Backfill (future)
|
||||
|
||||
```bash
|
||||
python collect_p2p.py --backfill --start=2026-06-01 --end=2026-06-03
|
||||
```
|
||||
|
||||
- Not needed now
|
||||
- Architecture supports it later
|
||||
|
||||
### Mode 4: Validate-only
|
||||
|
||||
```bash
|
||||
python collect_p2p.py --validate data/raw/buy_ads/year=2026/month=06/day=05/
|
||||
```
|
||||
|
||||
- Reads Parquet files
|
||||
- Runs validation checks
|
||||
- Prints report
|
||||
- No API calls
|
||||
|
||||
---
|
||||
|
||||
## 12. Testing the Coder's Work
|
||||
|
||||
Hand this checklist to the coder when they say "it's done":
|
||||
|
||||
| # | Test | How |
|
||||
|---|---|---|
|
||||
| 1 | **API connectivity** | `python collect_p2p.py --once` returns ads without error |
|
||||
| 2 | **Pagination works** | Inspect: total ads fetched vs `total` field from API |
|
||||
| 3 | **Both BUY and SELL** | Both directories have at least one file after `--once` |
|
||||
| 4 | **Schema correct** | `pd.read_parquet(file)` → 23 columns, correct dtypes |
|
||||
| 5 | **Payment methods populated** | At least 3 payment methods in the first snapshot |
|
||||
| 6 | **Atomic write** | Kill the process mid-write (SIGKILL): no partial `.parquet` file remains — at most a leftover `.tmp` file |
|
||||
| 7 | **Graceful shutdown** | Ctrl+C during a snapshot → clean exit, last snapshot saved |
|
||||
| 8 | **Restart resilience** | Start collector, kill it, restart → resumes without duplicate timestamps |
|
||||
| 9 | **Rate limiting** | No HTTP 429 in logs after 1 hour of continuous running |
|
||||
| 10 | **Storage efficiency** | 1 hour of data ≤ 3 MB total on disk |
|
||||
|
||||
---
|
||||
|
||||
## 13. Post-Collection — What the Data Will Look Like After One Week
|
||||
|
||||
| Metric | Expected value |
|
||||
|---|---|
|
||||
| Snapshots collected | ~2,016 (7 days × 288 snapshots/day) |
|
||||
| Total raw ads | ~200,000–400,000 rows |
|
||||
| Storage used | ~20–100 MB |
|
||||
| Unique advertisers | 100–500 |
|
||||
| Unique payment methods | 10–20 |
|
||||
| Price range (BUY) | ~55–65 VES/USDT (fluctuates with parallel dollar) |
|
||||
| Price range (SELL) | ~58–70 VES/USDT |
|
||||
| Typical spread | ~2–6 VES/USDT (3–10%) |
|
||||
|
||||
**After 1 week of collection, we stop and do EDA before any ML decisions.**
|
||||
|
||||
---
|
||||
|
||||
## 14. Known Gotchas / FAQ for the Coder
|
||||
|
||||
**Q: What if the API returns different fields than documented?**
|
||||
A: The normalizer should use `.get()` with defaults for every field. Log a warning if a field is missing that we expected. Don't crash.
|
||||
|
||||
**Q: What if `tradeMethods` is empty?**
|
||||
A: Some ads have no payment methods listed. Store as empty list `[]`. Continue. This is valid data.
|
||||
|
||||
**Q: What timezone should I use?**
|
||||
A: **Everything in UTC.** The user is in VET (UTC-4), but all stored timestamps are UTC. Timezone conversion is only for display.
|
||||
|
||||
**Q: What if the VPS reboots?**
|
||||
A: Run the collector as an enabled systemd service with `Restart=always`, so it starts again after a reboot or a crash. The collector reads the last checkpoint and continues after the appropriate delay.
|
||||
|
||||
**Q: Should I use asyncio?**
|
||||
A: No. Simple synchronous code. The delay between collection cycles (5 minutes) means async provides zero benefit and adds complexity.
|
||||
|
||||
**Q: Can I use SQLite instead of Parquet?**
|
||||
A: You could, but Parquet is more storage-efficient and directly loadable into ML frameworks (Pandas, Polars, PyTorch). Stick with Parquet.
|
||||
|
||||
---
|
||||
|
||||
*End of data collection spec. Hand this to the coding agent as the single source of truth.*
|
||||
36
p2p-collector/Makefile
Normal file
36
p2p-collector/Makefile
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
.PHONY: setup test run run-once clean validate

VENV = .venv
PYTHON = $(VENV)/bin/python3
PIP = $(VENV)/bin/pip

# Create the virtualenv and install dependencies.
# Re-runs whenever requirements.txt is newer than the venv marker file.
setup: $(VENV)/bin/activate

$(VENV)/bin/activate: requirements.txt
	python3 -m venv $(VENV)
	$(PIP) install --upgrade pip
	$(PIP) install -r requirements.txt
	touch $(VENV)/bin/activate

# Run the unit tests under tests/.
test: setup
	$(PYTHON) -m unittest discover -s tests -p "test_*.py"

# Run the collector with no flags.
run: setup
	$(PYTHON) collect_p2p.py

# Run the collector with --once.
run-once: setup
	$(PYTHON) collect_p2p.py --once

# Validate already-written Parquet files: make validate PATH_TO_VALIDATE=<path>
validate: setup
	@if [ -z "$(PATH_TO_VALIDATE)" ]; then \
		echo "Usage: make validate PATH_TO_VALIDATE=<path>"; \
		exit 1; \
	fi
	$(PYTHON) collect_p2p.py --validate "$(PATH_TO_VALIDATE)"

# Remove the virtualenv, bytecode caches and leftover temp files.
clean:
	rm -rf $(VENV)
	find . -type f -name "*.pyc" -delete
	find . -type d -name "__pycache__" -prune -exec rm -rf {} +
	find . -type f -name "*.tmp" -delete
	@echo "Cleanup complete."
|
||||
103
p2p-collector/README.md
Normal file
103
p2p-collector/README.md
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
# Binance P2P Data Collector
|
||||
|
||||
This tool continuously collects public peer-to-peer (P2P) market advertisements from Binance P2P for Venezuela (VES/USDT), normalizing, validating, and saving them as atomic date-partitioned Parquet files for subsequent exploratory data analysis and arbitrage modeling.
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
p2p-collector/
|
||||
├── collect_p2p.py # Entry point: argument parsing, validation/daemon modes
|
||||
├── config.yaml # Application configuration (endpoints, delays, validation limits)
|
||||
├── binance_client.py # HTTP client, pagination logic, retry, and 429 backoff
|
||||
├── normalizer.py # Converts raw nested API responses into a flat 23-column schema
|
||||
├── validator.py # Row-level filtering and snapshot-level integrity checks
|
||||
├── storage.py # Atomic Parquet writes, schema references, and checkpoints
|
||||
├── scheduler.py # Loop executor, initial start offsets, signal handling
|
||||
├── alert.py # Write alert marker files on 5 consecutive failures & logger setup
|
||||
├── utils.py # Time and sleep/jitter helpers
|
||||
├── requirements.txt # Package dependencies (httpx, pandas, pyarrow, pyyaml)
|
||||
├── Makefile # Automation targets (setup, test, run, clean)
|
||||
└── tests/ # Suite of unit tests for all components
|
||||
```
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- **Python 3.8+** (Developed and tested with Python 3.14)
|
||||
- **Make** (utility for running Makefile targets)
|
||||
|
||||
## Installation & Setup
|
||||
|
||||
Set up the Python virtual environment and install all dependencies:
|
||||
|
||||
```bash
|
||||
make setup
|
||||
```
|
||||
|
||||
## Running the Collector
|
||||
|
||||
### Mode 1: Continuous Daemon Mode
|
||||
Runs indefinitely, fetching a snapshot at the configured interval (default: 5 minutes) with ±10% sleep jitter so that requests do not arrive on a perfectly regular schedule. Shuts down gracefully on SIGINT/SIGTERM once the snapshot in progress has completed.
|
||||
|
||||
```bash
|
||||
make run
|
||||
```
|
||||
|
||||
### Mode 2: One-shot Mode (Test/Debug)
|
||||
Runs exactly one cycle (one BUY snapshot and one SELL snapshot), writes the results to disk, and exits immediately:
|
||||
|
||||
```bash
|
||||
make run-once
|
||||
```
|
||||
|
||||
### Mode 3: Validate-Only Mode
|
||||
Validates existing Parquet files without making any network calls. It prints statistics (row count, min/max prices, payment methods) and checks for critical schema issues:
|
||||
|
||||
```bash
|
||||
make validate PATH_TO_VALIDATE=data/raw/buy_ads/year=2026/month=06/day=05/
|
||||
```
|
||||
|
||||
## Running Tests
|
||||
|
||||
Run the test suite to verify the client, normalizer, storage, and validation behaviors:
|
||||
|
||||
```bash
|
||||
make test
|
||||
```
|
||||
|
||||
## Output Directory Structure
|
||||
|
||||
All output is written under the `./data/` directory inside the project root:
|
||||
|
||||
```
|
||||
data/
|
||||
├── raw/
|
||||
│ ├── buy_ads/
|
||||
│ │ └── year=YYYY/month=MM/day=DD/
|
||||
│ │ ├── _schema.parquet # Empty schema reference
|
||||
│ │ └── snapshot_YYYYMMDD_HHMMSS.parquet # Atomic snapshot data
|
||||
│ └── sell_ads/
|
||||
│ └── year=YYYY/month=MM/day=DD/
|
||||
│ ├── _schema.parquet
|
||||
│ └── snapshot_YYYYMMDD_HHMMSS.parquet
|
||||
├── logs/
|
||||
│ └── collector.log # Rotating logs
|
||||
├── alerts/
|
||||
│ └── YYYYMMDD_HHMMSS_5_failures.alert # Alert marker JSON file (only on failures)
|
||||
└── checkpoint.json # Restart resilience marker
|
||||
```
|
||||
|
||||
## Checkpoint Format
|
||||
|
||||
A checkpoint file is updated on every successful snapshot, ensuring that restarting the daemon will not query the API until the expected interval has passed:
|
||||
|
||||
```json
|
||||
{
|
||||
"last_completed_snapshot": "2026-06-05T13:30:00Z",
|
||||
"last_buy_ad_count": 47,
|
||||
"last_sell_ad_count": 53,
|
||||
"consecutive_failures": 0,
|
||||
"total_snapshots": 284,
|
||||
"first_snapshot": "2026-06-01T00:00:00Z",
|
||||
"version": "1.0"
|
||||
}
|
||||
```
|
||||
80
p2p-collector/alert.py
Normal file
80
p2p-collector/alert.py
Normal file
|
|
@ -0,0 +1,80 @@
|
|||
import os
|
||||
import json
|
||||
import logging
|
||||
from logging.handlers import RotatingFileHandler
|
||||
from datetime import datetime
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def setup_logging(config: dict) -> None:
    """
    Configure the root logger from the ``logging`` section of ``config``.

    A console handler is always installed. A rotating file handler is added
    on top of it when the log directory can be created and the log file can
    be opened; if that fails the error is logged and the process continues
    with console logging only.

    Args:
        config: Parsed application configuration. Keys read from its
            ``logging`` section: ``level`` (default ``"INFO"``), ``file``
            (default ``"./data/logs/collector.log"``), ``max_bytes``
            (default 10485760), ``backup_count`` (default 5) and ``format``.
    """
    # `logging:` with no children parses to None in YAML, so fall back to {}.
    log_config = config.get("logging") or {}
    level_str = log_config.get("level", "INFO")
    level = getattr(logging, str(level_str).upper(), logging.INFO)
    if not isinstance(level, int):
        # The name matched some non-level attribute of the logging module.
        level = logging.INFO

    log_file = log_config.get("file", "./data/logs/collector.log")
    max_bytes = log_config.get("max_bytes", 10485760)
    backup_count = log_config.get("backup_count", 5)
    log_format = log_config.get("format", "%(asctime)s | %(levelname)s | %(message)s")

    # Set up root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(level)

    # Remove existing handlers to avoid duplicates on re-setup, and close
    # them so a previously opened log file does not leak its descriptor.
    for handler in list(root_logger.handlers):
        root_logger.removeHandler(handler)
        handler.close()

    # Console handler (installed first so later failures are still visible)
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(log_format))
    root_logger.addHandler(console_handler)

    # Rotating file handler; directory creation is part of the best-effort
    # step so that an unwritable location degrades to console-only logging.
    try:
        log_dir = os.path.dirname(log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        file_handler = RotatingFileHandler(
            log_file, maxBytes=max_bytes, backupCount=backup_count, encoding="utf-8"
        )
        file_handler.setFormatter(logging.Formatter(log_format))
        root_logger.addHandler(file_handler)
        logger.info(f"Logging configured successfully. Writing to {log_file}")
    except Exception as e:
        logger.error(f"Failed to set up file logging: {e}")
|
||||
|
||||
|
||||
def write_alert_file(config: dict, timestamp: datetime, error_msg: str, consecutive_failures: int, traceback_str: str) -> str:
    """
    Persist an alert marker as a JSON document in the alert directory.

    The directory is taken from ``config["alerts"]["alert_dir"]`` (default
    ``./data/alerts``) and created when absent. The file name combines the
    timestamp and the failure count.

    Returns the path of the written file, or an empty string when the file
    could not be written.
    """
    target_dir = config.get("alerts", {}).get("alert_dir", "./data/alerts")
    os.makedirs(target_dir, exist_ok=True)

    stamp = timestamp.strftime("%Y%m%d_%H%M%S")
    alert_path = os.path.join(target_dir, f"{stamp}_{consecutive_failures}_failures.alert")

    payload = {
        "timestamp": timestamp.isoformat(),
        "error": error_msg,
        "consecutive_failures": consecutive_failures,
        "traceback": traceback_str,
    }

    try:
        with open(alert_path, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, indent=4)
        logger.error(f"ALERT WRITTEN: {alert_path}")
    except Exception as exc:
        logger.critical(f"Failed to write alert file at {alert_path}: {exc}")
        return ""
    return alert_path
|
||||
144
p2p-collector/binance_client.py
Normal file
144
p2p-collector/binance_client.py
Normal file
|
|
@ -0,0 +1,144 @@
|
|||
import time
|
||||
import logging
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class BinanceP2PError(Exception):
    """Root of the exception hierarchy raised by Binance P2P client operations."""
|
||||
|
||||
class RateLimitError(BinanceP2PError):
    """Signals that the Binance P2P API answered with HTTP 429 (rate limited)."""
|
||||
|
||||
class APIError(BinanceP2PError):
    """Signals an API response with success=false or an invalid structure."""
|
||||
|
||||
class BinanceP2PClient:
    """
    Synchronous client for the Binance P2P advertisement search endpoint.

    Handles request headers, pagination, retries for transient transport and
    5xx failures, and an escalating sleep after HTTP 429 responses.
    """

    # Ads requested per page; also used to decide when pagination is complete.
    ROWS_PER_PAGE = 20
    # Initial and maximum sleep (seconds) applied after an HTTP 429.
    INITIAL_429_BACKOFF = 60
    MAX_429_BACKOFF = 480

    def __init__(self, config: dict):
        """
        Read endpoint, pagination and retry settings from ``config``.

        Args:
            config: Parsed application configuration; the ``binance`` and
                ``collection`` sections are read, each key falling back to a
                built-in default when absent.
        """
        # A section present but empty in YAML parses to None; treat as {}.
        binance_cfg = config.get("binance") or {}
        self.base_url = binance_cfg.get("base_url", "https://p2p.binance.com/bapi/c2c/v2/friendly/c2c/adv/search")
        self.user_agent = binance_cfg.get("user_agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
        self.timeout = binance_cfg.get("timeout_seconds", 15)
        self.max_pages = binance_cfg.get("max_pages", 10)
        self.page_delay = binance_cfg.get("request_delay_seconds", 0.5)

        self.headers = {
            "Content-Type": "application/json",
            "User-Agent": self.user_agent,
            "Accept": "*/*",
            "Origin": "https://p2p.binance.com",
            "Referer": "https://p2p.binance.com/",
        }

        # Retry config
        collection_cfg = config.get("collection") or {}
        self.max_retries = collection_cfg.get("retry_attempts", 3)
        self.retry_delay_base = collection_cfg.get("retry_delay_base_seconds", 10)

        # 429 Rate Limiting State
        self.current_429_backoff = self.INITIAL_429_BACKOFF
        self.had_429_this_cycle = False

    def reset_429_backoff(self):
        """Resets the 429 backoff delay to its initial value (60s)."""
        if self.current_429_backoff != self.INITIAL_429_BACKOFF:
            logger.info("Resetting 429 rate limit backoff to 60s.")
        self.current_429_backoff = self.INITIAL_429_BACKOFF

    def double_429_backoff(self):
        """
        Sleep for the current 429 backoff, then double it for the next 429.

        The delay is capped at 480s. Also records that a 429 was seen by
        setting ``had_429_this_cycle``.
        """
        self.had_429_this_cycle = True
        delay = self.current_429_backoff
        # The sleep happens now; the doubled value applies to the *next* 429.
        logger.warning(f"Rate limited (429). Backing off for {delay}s.")
        time.sleep(delay)
        self.current_429_backoff = min(delay * 2, self.MAX_429_BACKOFF)

    def _post_request_with_retries(self, body: dict) -> dict:
        """
        Executes a POST request to the Binance P2P API with retries for
        transport and 5xx errors, and special handling for 429 Rate Limits.

        Args:
            body: JSON-serialisable request payload.

        Returns:
            The decoded response object (a dict whose ``success`` is truthy).

        Raises:
            RateLimitError: on HTTP 429, after sleeping for the backoff delay.
            APIError: when the body is not JSON, is not an object, or reports
                ``success`` as false.
            BinanceP2PError: on other 4xx responses, or when every attempt
                failed with a transport error or a 5xx response.
        """
        for attempt in range(1, self.max_retries + 1):
            is_last_attempt = attempt >= self.max_retries
            try:
                # Use httpx.Client for synchronous calls. The response body is
                # fully read by post(), so it stays usable after the client closes.
                with httpx.Client(headers=self.headers, timeout=self.timeout) as client:
                    resp = client.post(self.base_url, json=body)

                # Handle 429 specifically
                if resp.status_code == 429:
                    self.double_429_backoff()
                    raise RateLimitError("HTTP 429 Rate Limited by Binance.")

                # Handle 5xx server errors
                if 500 <= resp.status_code < 600:
                    if is_last_attempt:
                        # Nothing left to retry: do not waste 60s sleeping.
                        logger.warning(f"Binance P2P API returned HTTP {resp.status_code} (attempt {attempt}/{self.max_retries}). Giving up.")
                        break
                    logger.warning(f"Binance P2P API returned HTTP {resp.status_code} (attempt {attempt}/{self.max_retries}). Retrying in 60s...")
                    time.sleep(60)
                    continue

                # Raise for other HTTP errors (4xx except 429)
                resp.raise_for_status()

                # Parse JSON; a malformed body is reported as an APIError
                # rather than leaking a bare ValueError to the caller.
                try:
                    data = resp.json()
                except ValueError as e:
                    raise APIError(f"API response is not valid JSON: {e}") from e

                if not isinstance(data, dict):
                    raise APIError(f"API response has unexpected structure: {data!r}")
                if not data.get("success"):
                    raise APIError(f"API response success=false: {data}")

                return data

            except (httpx.TimeoutException, httpx.NetworkError) as e:
                # Covers connect/read/write/pool timeouts and connection-level
                # failures, all of which are worth retrying.
                if is_last_attempt:
                    logger.warning(f"Connection error occurred: {e} (attempt {attempt}/{self.max_retries}). Giving up.")
                    raise BinanceP2PError(f"Failed to connect after {self.max_retries} attempts: {e}") from e
                logger.warning(f"Connection error occurred: {e} (attempt {attempt}/{self.max_retries}). Retrying in 30s...")
                time.sleep(30)
            except httpx.HTTPStatusError as e:
                logger.error(f"HTTP Error {e.response.status_code}: {e.response.text}")
                raise BinanceP2PError(f"HTTP Status Error: {e}") from e

        raise BinanceP2PError("Failed to fetch P2P ads after maximum retries.")

    def fetch_all_ads(self, trade_type: str, asset: str, fiat: str) -> list:
        """
        Fetches all P2P advertisements for a given trade type, asset, and fiat,
        handling pagination and page-level delays.

        Args:
            trade_type: Value sent as ``tradeType`` (callers use "BUY"/"SELL").
            asset: Value sent as ``asset``.
            fiat: Value sent as ``fiat``.

        Returns:
            The raw ad objects from every fetched page, in response order.
            At most ``max_pages`` pages are requested.

        Raises:
            BinanceP2PError: propagated from the underlying request helper.
        """
        all_ads = []
        rows = self.ROWS_PER_PAGE

        for page in range(1, self.max_pages + 1):
            body = {
                "asset": asset,
                "fiat": fiat,
                "tradeType": trade_type,
                "page": page,
                "rows": rows,
                "payTypes": [],
                "countries": [],
                "publisherType": None,
                "classify": "personal",
                "filter": {},
            }

            logger.info(f"Fetching {trade_type} page {page}/{self.max_pages} for {asset}/{fiat}...")
            data = self._post_request_with_retries(body)

            # Either field may be present but null; normalise before use.
            ads = data.get("data") or []
            total = data.get("total") or 0

            all_ads.extend(ads)
            logger.info(f"Retrieved {len(ads)} ads. Total collected so far: {len(all_ads)}/{total}")

            # An empty page means there is nothing further to page through,
            # even if the reported total claims otherwise.
            if not ads:
                break

            # Stop if we've collected all available ads
            if len(all_ads) >= total:
                break

            # Don't request a page that starts beyond total ads
            if page * rows >= total:
                break

            # Delay between pages
            if page < self.max_pages:
                time.sleep(self.page_delay)

        return all_ads
|
||||
176
p2p-collector/collect_p2p.py
Normal file
176
p2p-collector/collect_p2p.py
Normal file
|
|
@ -0,0 +1,176 @@
|
|||
import os
|
||||
import argparse
|
||||
import sys
|
||||
import logging
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
|
||||
from alert import setup_logging
|
||||
from scheduler import P2PCollectorScheduler
|
||||
|
||||
logger = logging.getLogger("collect_p2p")
|
||||
|
||||
def load_config(config_path: str) -> dict:
    """
    Loads the YAML configuration file.

    Args:
        config_path: Path to the YAML file.

    Returns:
        The parsed configuration mapping. An empty file yields ``{}`` so that
        callers can rely on ``.get`` and fall back to their defaults.

    Exits the process with status 1 (after printing to stderr) when the file
    is missing, cannot be parsed, or its top-level node is not a mapping.
    """
    if not os.path.exists(config_path):
        print(f"Error: Config file not found at {config_path}", file=sys.stderr)
        sys.exit(1)

    try:
        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)
    except Exception as e:
        print(f"Error parsing config file {config_path}: {e}", file=sys.stderr)
        sys.exit(1)

    # safe_load returns None for an empty document.
    if config is None:
        return {}

    if not isinstance(config, dict):
        print(
            f"Error parsing config file {config_path}: top-level YAML node must be a mapping",
            file=sys.stderr,
        )
        sys.exit(1)

    return config
|
||||
|
||||
|
||||
def validate_parquet_files(path_str: str):
    """
    Validates existing Parquet files at the specified path (file or directory).
    Prints a report of the contents and validation status.

    For every validated file the report shows the row count, BUY/SELL counts,
    price range and the set of payment methods. Missing expected columns,
    nulls in ``price``/``adv_no``/``advertiser_no``, a non-float ``price``
    column and unreadable files are critical errors; unexpected extra columns
    are warnings. ``_schema.parquet`` files are skipped unless such a file is
    the only one found.

    Args:
        path_str: A ``.parquet`` file, or a directory searched recursively.

    Returns:
        None when no Parquet files are found. Otherwise the function does not
        return: it exits with status 0 when no critical errors were recorded
        and 1 when at least one was. It also exits with status 1 when the path
        does not exist.
    """
    path = Path(path_str)
    if not path.exists():
        print(f"Error: Path does not exist: {path_str}", file=sys.stderr)
        sys.exit(1)

    files = []
    if path.is_file():
        if path.suffix == ".parquet":
            files.append(path)
    elif path.is_dir():
        files = list(path.glob("**/*.parquet"))

    if not files:
        print(f"No Parquet files found at {path_str}")
        return

    # Skip schema files unless one is specifically targeted. Filtering up
    # front keeps the reported file counts equal to what is actually checked.
    skip_schema = len(files) > 1
    targets = [
        f for f in sorted(files)
        if not (skip_schema and f.name == "_schema.parquet")
    ]

    print(f"Validating {len(targets)} Parquet file(s)...")
    print("-" * 60)

    total_rows = 0
    total_buy = 0
    total_sell = 0
    critical_errors = []
    warnings = []

    expected_columns = {
        "snapshot_id", "fetched_at", "fetched_date", "trade_type", "adv_no",
        "asset", "fiat", "price", "surplus_amount", "min_amount", "max_amount",
        "tradable_quantity", "advertiser_no", "advertiser_name", "advertiser_type",
        "month_order_count", "month_finish_rate", "positive_rate", "user_positive_rate",
        "payment_methods", "payment_method_ids", "ad_created_at", "price_type"
    }
    critical_fields = ["price", "adv_no", "advertiser_no"]

    for f in targets:
        try:
            df = pd.read_parquet(f)
            rows = len(df)
            total_rows += rows

            print(f"File: {f.name} ({rows} rows)")

            # Check columns
            cols = set(df.columns)
            missing_cols = expected_columns - cols
            extra_cols = cols - expected_columns
            if missing_cols:
                critical_errors.append(f"{f.name}: Missing expected columns: {missing_cols}")
            if extra_cols:
                warnings.append(f"{f.name}: Has extra columns: {extra_cols}")

            # Analyze trade types
            if "trade_type" in df.columns:
                buy_cnt = (df["trade_type"] == "BUY").sum()
                sell_cnt = (df["trade_type"] == "SELL").sum()
                total_buy += buy_cnt
                total_sell += sell_cnt
                print(f"  Trade types: BUY={buy_cnt}, SELL={sell_cnt}")

            # Check critical nulls
            for col in critical_fields:
                if col in df.columns:
                    null_cnt = df[col].isnull().sum()
                    if null_cnt > 0:
                        critical_errors.append(f"{f.name}: Column '{col}' has {null_cnt} null values")

            # Check types
            if "price" in df.columns and not pd.api.types.is_float_dtype(df["price"]):
                critical_errors.append(f"{f.name}: Column 'price' is not float type")

            # Prices summary
            if "price" in df.columns and rows > 0:
                print(f"  Price range: [{df['price'].min():.2f} - {df['price'].max():.2f}]")

            # Payment methods
            if "payment_methods" in df.columns and rows > 0:
                methods = set()
                # payment_methods could be list of lists/arrays
                for item in df["payment_methods"]:
                    if isinstance(item, str):
                        methods.add(item)
                    elif hasattr(item, "__iter__"):
                        methods.update(item)
                print(f"  Payment methods ({len(methods)}): {sorted(methods)}")

        except Exception as e:
            # One unreadable or malformed file must not abort the whole report.
            critical_errors.append(f"{f.name}: Failed to read/validate: {e}")

    print("=" * 60)
    print("VALIDATION SUMMARY")
    print("=" * 60)
    print(f"Total files validated: {len(targets)}")
    print(f"Total rows: {total_rows}")
    print(f"Total BUY ads: {total_buy}")
    print(f"Total SELL ads: {total_sell}")

    print("\nWarnings:")
    if warnings:
        for w in warnings:
            print(f"  - {w}")
    else:
        print("  None")

    print("\nCritical Errors:")
    if critical_errors:
        for err in critical_errors:
            print(f"  - [FAIL] {err}")
        sys.exit(1)
    else:
        print("  [PASS] No critical validation issues found!")
        sys.exit(0)
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point.

    With ``--validate PATH`` only the Parquet validation report is produced.
    Otherwise the configuration is loaded, logging is set up and the
    scheduler is run, for a single cycle when ``--once`` is given.
    """
    cli = argparse.ArgumentParser(description="Binance P2P Data Collector")
    cli.add_argument("--config", default="config.yaml", help="Path to config.yaml file")
    cli.add_argument("--once", action="store_true", help="Run a single collection cycle and exit")
    cli.add_argument("--validate", help="Path to Parquet file or directory to validate")
    options = cli.parse_args()

    # Validate-only mode never touches the config, logging or the network.
    if options.validate:
        validate_parquet_files(options.validate)
        return

    settings = load_config(options.config)
    setup_logging(settings)
    P2PCollectorScheduler(settings).run(once=options.once)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
32
p2p-collector/config.yaml
Normal file
32
p2p-collector/config.yaml
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
binance:
|
||||
base_url: "https://p2p.binance.com/bapi/c2c/v2/friendly/c2c/adv/search"
|
||||
user_agent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
||||
timeout_seconds: 15
|
||||
max_pages: 10
|
||||
request_delay_seconds: 0.5
|
||||
|
||||
collection:
|
||||
pairs:
|
||||
- asset: "USDT"
|
||||
fiat: "VES"
|
||||
interval_seconds: 300
|
||||
output_dir: "./data/raw"
|
||||
retry_attempts: 3
|
||||
retry_delay_base_seconds: 10
|
||||
|
||||
validation:
|
||||
price_min: 1.0
|
||||
price_max: 2000.0
|
||||
reject_zero_finish_rate: true
|
||||
reject_zero_surplus: true
|
||||
|
||||
logging:
|
||||
level: "INFO"
|
||||
file: "./data/logs/collector.log"
|
||||
max_bytes: 10485760 # 10 MB
|
||||
backup_count: 5
|
||||
format: "%(asctime)s | %(levelname)s | %(message)s"
|
||||
|
||||
alerts:
|
||||
consecutive_failure_threshold: 5
|
||||
alert_dir: "./data/alerts"
|
||||
99
p2p-collector/normalizer.py
Normal file
99
p2p-collector/normalizer.py
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
import logging
|
||||
from datetime import datetime, timezone
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def _safe_float(val, default=0.0, field_name=None):
    """Convert ``val`` to float; return ``default`` for None or unparseable input."""
    if val is None:
        return default
    try:
        return float(val)
    except (ValueError, TypeError) as e:
        if field_name:
            logger.warning(f"Could not convert field '{field_name}' value {val!r} to float: {e}")
        return default


def _safe_int(val, default=0, field_name=None):
    """Convert ``val`` to int; return ``default`` for None or unparseable input."""
    if val is None:
        return default
    try:
        return int(val)
    except (ValueError, TypeError) as e:
        if field_name:
            logger.warning(f"Could not convert field '{field_name}' value {val!r} to int: {e}")
        return default


def normalize_ad(raw_ad: dict, trade_type: str, fetched_at: datetime) -> dict:
    """
    Normalizes a single P2P ad dictionary from the Binance API response
    into a flattened dictionary schema matching the spec.

    Missing or malformed fields never raise: they are logged as warnings and
    replaced with defaults (0 / 0.0 for numbers, "" for identifiers).

    Args:
        raw_ad: One element of the API ``data`` list, expected to hold the
            nested ``adv`` and ``advertiser`` objects.
        trade_type: Side label stored on the row and used in ``snapshot_id``.
        fetched_at: Snapshot timestamp; also the fallback for
            ``ad_created_at`` when ``createTime`` is absent or unparseable.

    Returns:
        A flat dict with the 23 columns of the storage schema.
    """
    adv = raw_ad.get("adv")
    adver = raw_ad.get("advertiser")

    # Check if critical structures are missing. A key that is present but
    # null (or not an object) is treated the same as an absent key.
    if not isinstance(adv, dict) or not adv:
        logger.warning("Ad structure 'adv' is missing in raw ad data.")
        adv = {}
    if not isinstance(adver, dict) or not adver:
        logger.warning("Advertiser structure 'advertiser' is missing in raw ad data.")
        adver = {}

    # Extract payment methods
    payment_methods = []
    payment_method_ids = []
    for m in adv.get("tradeMethods") or []:
        if not isinstance(m, dict):
            continue
        pay_type = m.get("payType")
        identifier = m.get("identifier")
        if pay_type:
            payment_methods.append(pay_type)
        if identifier:
            payment_method_ids.append(identifier)

    # Convert createTime (milliseconds since epoch) to datetime. float() is
    # called directly so that an unparseable value falls back to fetched_at
    # instead of silently turning into 1970-01-01.
    ad_created_at = fetched_at
    create_time_ms = adv.get("createTime")
    if create_time_ms is not None:
        try:
            ad_created_at = datetime.fromtimestamp(float(create_time_ms) / 1000.0, tz=timezone.utc)
        except (TypeError, ValueError, OverflowError, OSError) as e:
            logger.warning(f"Could not parse ad createTime {create_time_ms}: {e}")

    # Check for missing expected fields to log warnings, but don't fail
    for key in ("advNo", "asset", "fiatUnit", "price"):
        if key not in adv:
            logger.warning(f"Expected key '{key}' not found in 'adv' structure of ad: {raw_ad}")

    if "userNo" not in adver:
        logger.warning(f"Expected key 'userNo' not found in 'advertiser' structure of ad: {raw_ad}")

    return {
        "snapshot_id": f"{fetched_at.strftime('%Y%m%dT%H%M%SZ')}_{trade_type}",
        "fetched_at": fetched_at,
        "fetched_date": fetched_at.strftime("%Y-%m-%d"),
        "trade_type": trade_type,
        "adv_no": adv.get("advNo", ""),
        "asset": adv.get("asset", "USDT"),
        "fiat": adv.get("fiatUnit", "VES"),
        "price": _safe_float(adv.get("price"), 0.0, "price"),
        "surplus_amount": _safe_float(adv.get("surplusAmount"), 0.0, "surplusAmount"),
        "min_amount": _safe_float(adv.get("minSingleTransAmount"), 0.0, "minSingleTransAmount"),
        "max_amount": _safe_float(adv.get("maxSingleTransAmount"), 0.0, "maxSingleTransAmount"),
        "tradable_quantity": _safe_float(adv.get("tradableQuantity"), 0.0, "tradableQuantity"),
        "advertiser_no": adver.get("userNo", ""),
        "advertiser_name": adver.get("nickName", ""),
        "advertiser_type": adver.get("userType", "user"),
        "month_order_count": _safe_int(adver.get("monthOrderCount"), 0, "monthOrderCount"),
        "month_finish_rate": _safe_float(adver.get("monthFinishRate"), 0.0, "monthFinishRate"),
        "positive_rate": _safe_float(adver.get("positiveRate"), 0.0, "positiveRate"),
        "user_positive_rate": _safe_float(adver.get("userPositiveRate"), 0.0, "userPositiveRate"),
        "payment_methods": payment_methods,
        "payment_method_ids": payment_method_ids,
        "ad_created_at": ad_created_at,
        "price_type": adv.get("priceType", "FIXED"),
    }
|
||||
4
p2p-collector/requirements.txt
Normal file
4
p2p-collector/requirements.txt
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
httpx>=0.27,<1.0
|
||||
pandas>=2.0,<3.0
|
||||
pyarrow>=14.0
|
||||
pyyaml>=6.0,<7.0
|
||||
44
p2p-collector/sample_responses/response_buy.json
Normal file
44
p2p-collector/sample_responses/response_buy.json
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
{
|
||||
"data": [
|
||||
{
|
||||
"adv": {
|
||||
"advNo": "6f8b2e12345",
|
||||
"tradeType": "BUY",
|
||||
"asset": "USDT",
|
||||
"fiatUnit": "VES",
|
||||
"price": "58.50",
|
||||
"surplusAmount": "1520.43",
|
||||
"maxSingleTransAmount": "5000.00",
|
||||
"minSingleTransAmount": "100.00",
|
||||
"tradableQuantity": "1520.43",
|
||||
"createTime": 1749128400000,
|
||||
"fiatSymbol": "Bs",
|
||||
"priceType": "FIXED",
|
||||
"tradeMethods": [
|
||||
{
|
||||
"identifier": "Banco_Banesco",
|
||||
"payType": "BANESCO",
|
||||
"payMethodId": "BANESCO"
|
||||
},
|
||||
{
|
||||
"identifier": "Pago_Movil",
|
||||
"payType": "PAGO_MOVIL",
|
||||
"payMethodId": "PAGO_MOVIL"
|
||||
}
|
||||
]
|
||||
},
|
||||
"advertiser": {
|
||||
"userNo": "ABC123456",
|
||||
"nickName": "CryptoTraderVE",
|
||||
"userType": "merchant",
|
||||
"monthOrderCount": 342,
|
||||
"monthFinishRate": 0.97,
|
||||
"positiveRate": 0.99,
|
||||
"userPositiveRate": 0.99
|
||||
}
|
||||
}
|
||||
],
|
||||
"total": 1,
|
||||
"pageSize": 20,
|
||||
"success": true
|
||||
}
|
||||
44
p2p-collector/sample_responses/response_sell.json
Normal file
44
p2p-collector/sample_responses/response_sell.json
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
{
|
||||
"data": [
|
||||
{
|
||||
"adv": {
|
||||
"advNo": "7a9c3d98765",
|
||||
"tradeType": "SELL",
|
||||
"asset": "USDT",
|
||||
"fiatUnit": "VES",
|
||||
"price": "62.30",
|
||||
"surplusAmount": "2500.00",
|
||||
"maxSingleTransAmount": "10000.00",
|
||||
"minSingleTransAmount": "500.00",
|
||||
"tradableQuantity": "2500.00",
|
||||
"createTime": 1749129000000,
|
||||
"fiatSymbol": "Bs",
|
||||
"priceType": "FIXED",
|
||||
"tradeMethods": [
|
||||
{
|
||||
"identifier": "Banco_Mercantil",
|
||||
"payType": "MERCANTIL",
|
||||
"payMethodId": "MERCANTIL"
|
||||
},
|
||||
{
|
||||
"identifier": "Pago_Movil",
|
||||
"payType": "PAGO_MOVIL",
|
||||
"payMethodId": "PAGO_MOVIL"
|
||||
}
|
||||
]
|
||||
},
|
||||
"advertiser": {
|
||||
"userNo": "XYZ789012",
|
||||
"nickName": "MercantilSeller",
|
||||
"userType": "merchant",
|
||||
"monthOrderCount": 512,
|
||||
"monthFinishRate": 0.99,
|
||||
"positiveRate": 0.98,
|
||||
"userPositiveRate": 0.98
|
||||
}
|
||||
}
|
||||
],
|
||||
"total": 1,
|
||||
"pageSize": 20,
|
||||
"success": true
|
||||
}
|
||||
240
p2p-collector/scheduler.py
Normal file
240
p2p-collector/scheduler.py
Normal file
|
|
@ -0,0 +1,240 @@
|
|||
import os
|
||||
import time
|
||||
import signal
|
||||
import logging
|
||||
import traceback
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from binance_client import BinanceP2PClient, BinanceP2PError
|
||||
from normalizer import normalize_ad
|
||||
from validator import validate_row, validate_snapshot
|
||||
from storage import store_parquet, read_checkpoint, write_checkpoint
|
||||
from alert import write_alert_file
|
||||
from utils import jitter, now_utc
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class P2PCollectorScheduler:
|
||||
def __init__(self, config: dict):
    """
    Build the scheduler from the parsed application configuration.

    Reads the ``collection`` section (output directory, interval, pairs),
    derives the checkpoint location, creates the API client and installs
    SIGINT/SIGTERM handlers that request a graceful stop.
    """
    collection = config.get("collection", {})

    self.config = config
    self.running = True

    self.output_dir = collection.get("output_dir", "./data/raw")
    self.interval = collection.get("interval_seconds", 300)
    self.pairs = collection.get("pairs", [{"asset": "USDT", "fiat": "VES"}])

    # The spec layout places checkpoint.json beside raw/, i.e. in the
    # parent of the output directory:
    #   data/
    #   ├── raw/
    #   └── checkpoint.json
    self.data_dir = str(Path(self.output_dir).parent)
    self.checkpoint_path = os.path.join(self.data_dir, "checkpoint.json")

    self.client = BinanceP2PClient(config)
    self.consecutive_failures = 0

    # Both termination signals share the same graceful-shutdown callback.
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, self._handle_signal)
|
||||
|
||||
def _handle_signal(self, sig, frame):
    """Signal callback: clear the ``running`` flag so the main loop stops."""
    self.running = False
    logger.info(f"Received signal {sig}. Initiating graceful shutdown after current snapshot completes...")
|
||||
|
||||
def verify_directories(self):
    """
    Verifies that the output directory is writable by writing a test file and deleting it.

    The directory is created first if it does not exist.

    Raises:
        OSError: when the directory cannot be created or written to; the
            original error is attached as the cause.
    """
    test_file = os.path.join(self.output_dir, ".write_test")
    try:
        # Creation is inside the try so that a failure here is logged and
        # wrapped the same way as a failed write.
        os.makedirs(self.output_dir, exist_ok=True)
        with open(test_file, "w") as f:
            f.write("test")
        os.remove(test_file)
    except OSError as e:
        logger.critical(f"Directory verification failed on {self.output_dir}: {e}")
        raise OSError(f"Output directory not writable: {e}") from e

    logger.info(f"Directory write verification passed for {self.output_dir}")
|
||||
|
||||
def get_initial_wait_seconds(self) -> float:
    """
    Reads checkpoint to determine how long to wait before starting the loop.

    Returns:
        Seconds left until one full interval has passed since the last
        completed snapshot, or 0.0 when there is no usable checkpoint or the
        interval has already elapsed. Never more than one interval.
    """
    checkpoint = read_checkpoint(self.checkpoint_path) or {}
    last_completed = checkpoint.get("last_completed_snapshot")
    if not last_completed:
        return 0.0

    try:
        stamp = str(last_completed)
        # datetime.fromisoformat() only accepts a trailing "Z" from Python
        # 3.11 onwards; rewrite it as an explicit UTC offset.
        if stamp.endswith(("Z", "z")):
            stamp = stamp[:-1] + "+00:00"
        last_dt = datetime.fromisoformat(stamp)
        # Make sure timezone aware UTC
        if last_dt.tzinfo is None:
            last_dt = last_dt.replace(tzinfo=timezone.utc)

        elapsed = (now_utc() - last_dt).total_seconds()
    except Exception as e:
        logger.warning(f"Error parsing last snapshot time from checkpoint: {e}. Starting immediately.")
        return 0.0

    # A checkpoint dated in the future (clock change) would otherwise stall
    # startup for longer than one interval.
    wait_time = min(self.interval - elapsed, self.interval)
    if wait_time > 0:
        logger.info(f"Resuming. Last snapshot completed {elapsed:.1f}s ago. Initial wait time: {wait_time:.1f}s.")
        return wait_time

    return 0.0
|
||||
|
||||
def run_single_cycle(self) -> dict:
    """
    Collect, validate and store one snapshot for every configured pair.

    For each pair the BUY and SELL books are fetched (one second apart),
    normalized, filtered row by row, checked at snapshot level, written to
    the ``buy_ads`` and ``sell_ads`` directories and summarised in one log
    line.

    Returns:
        A dict keyed by ``"<asset>_<fiat>"`` holding ``buy_count``,
        ``sell_count`` and the snapshot ``timestamp`` (ISO format).
    """
    # One timestamp is shared by every row and file produced in this cycle.
    # NOTE(review): all pairs write under the same buy_ads/sell_ads roots
    # with this same timestamp - confirm store_parquet keeps multiple pairs
    # from colliding.
    snapshot_ts = now_utc()
    results = {}

    for pair in self.pairs:
        asset = pair.get("asset", "USDT")
        fiat = pair.get("fiat", "VES")

        started = time.time()

        # 1. Fetch raw advertisements, pausing 1 second between trade types
        buy_raw = self.client.fetch_all_ads("BUY", asset, fiat)
        time.sleep(1.0)
        sell_raw = self.client.fetch_all_ads("SELL", asset, fiat)

        # 2. Normalize and filter individual rows. The same seen-set is
        # passed to validate_row for both sides, BUY rows first.
        seen_adv_nos = set()
        flat_buy = [
            row
            for row in (normalize_ad(ad, "BUY", snapshot_ts) for ad in buy_raw)
            if validate_row(row, self.config, seen_adv_nos)
        ]
        flat_sell = [
            row
            for row in (normalize_ad(ad, "SELL", snapshot_ts) for ad in sell_raw)
            if validate_row(row, self.config, seen_adv_nos)
        ]

        # 3. Snapshot-level validation
        summary = validate_snapshot(flat_buy + flat_sell, snapshot_ts)

        # 4. Storage (writes to raw/buy_ads and raw/sell_ads)
        store_parquet(flat_buy, os.path.join(self.output_dir, "buy_ads"), snapshot_ts)
        store_parquet(flat_sell, os.path.join(self.output_dir, "sell_ads"), snapshot_ts)

        elapsed = time.time() - started

        # 5. Snapshot summary log line, e.g.
        # 2026-06-05 13:30:00 UTC | BUY=47 ads [54.20-62.80] SELL=53 ads [58.00-68.50] | spread=-4.80 | took 3.2s | methods=[BANESCO,PAGO_MOVIL,...]
        methods_str = ",".join(summary["methods"])
        line_parts = [
            f"{snapshot_ts.strftime('%Y-%m-%d %H:%M:%S')} UTC | ",
            f"BUY={summary['buy_count']} ads [{summary['buy_min']:.2f}-{summary['buy_max']:.2f}] ",
            f"SELL={summary['sell_count']} ads [{summary['sell_min']:.2f}-{summary['sell_max']:.2f}] | ",
            f"spread={summary['spread']:.2f} | took {elapsed:.1f}s | ",
            f"methods=[{methods_str}]",
        ]
        logger.info("".join(line_parts))

        results[f"{asset}_{fiat}"] = {
            "buy_count": summary["buy_count"],
            "sell_count": summary["sell_count"],
            "timestamp": snapshot_ts.isoformat(),
        }

    return results
|
||||
|
||||
def run(self, once: bool = False):
    """Drive the collector: a single cycle when ``once`` is set, else loop until stopped.

    Always calls ``self.verify_directories()`` first.

    One-shot mode runs ``self.run_single_cycle()`` exactly once; any
    exception is logged with its traceback and then re-raised.

    Continuous mode waits ``self.get_initial_wait_seconds()`` and then
    repeats while ``self.running`` is truthy:

    * on success, the failure counter and the client's 429 backoff are
      reset and the checkpoint file is rewritten with the latest counts
      for the first configured pair;
    * on failure, the counter is incremented, the error is logged, an
      alert file is written once the counter reaches
      ``config["alerts"]["consecutive_failure_threshold"]`` (default 5),
      and the counter is persisted in the checkpoint;
    * either way, it then sleeps ``jitter(self.interval)`` seconds.

    Args:
        once: When true, run one cycle and return instead of looping.
    """

    def _pause(remaining: float) -> None:
        # Sleep in slices of at most one second so that a cleared
        # ``self.running`` flag is noticed promptly (presumably it is
        # flipped by a signal handler -- confirm against the class).
        slice_len = 1.0
        while remaining > 0 and self.running:
            time.sleep(min(slice_len, remaining))
            remaining -= slice_len

    self.verify_directories()

    if once:
        logger.info("Executing a single collection cycle (--once)...")
        try:
            self.run_single_cycle()
            logger.info("One-shot collection complete. Exiting.")
        except Exception as exc:
            logger.error(f"Error occurred during one-shot collection: {exc}")
            logger.error(traceback.format_exc())
            raise exc
        return

    # Continuous mode: optional start-up delay before the first cycle.
    startup_delay = self.get_initial_wait_seconds()
    if startup_delay > 0 and self.running:
        logger.info(f"Sleeping for initial delay of {startup_delay:.1f}s...")
        _pause(startup_delay)

    logger.info(f"Starting P2P data collector. Interval: {self.interval}s. Pairs: {self.pairs}")

    while self.running:
        cycle_start = now_utc()
        try:
            stats = self.run_single_cycle()

            # A completed cycle clears all failure/backoff state.
            self.consecutive_failures = 0
            self.client.reset_429_backoff()

            previous = read_checkpoint(self.checkpoint_path)

            # Only the first configured pair is reflected in the checkpoint.
            lead_pair = self.pairs[0]
            lead_stats = stats.get(f"{lead_pair['asset']}_{lead_pair['fiat']}", {})

            started_iso = cycle_start.isoformat()
            write_checkpoint(
                self.checkpoint_path,
                {
                    "last_completed_snapshot": started_iso,
                    "last_buy_ad_count": lead_stats.get("buy_count", 0),
                    "last_sell_ad_count": lead_stats.get("sell_count", 0),
                    "consecutive_failures": 0,
                    "total_snapshots": previous.get("total_snapshots", 0) + 1,
                    # Keep the earliest recorded snapshot time once it is set.
                    "first_snapshot": previous.get("first_snapshot", started_iso),
                    "version": "1.0",
                },
            )

        except Exception as exc:
            self.consecutive_failures += 1
            trace_text = traceback.format_exc()
            logger.error(f"Error during collection cycle (consecutive failures: {self.consecutive_failures}): {exc}")
            logger.error(trace_text)

            # Escalate to an alert file once failures pile up.
            threshold = self.config.get("alerts", {}).get("consecutive_failure_threshold", 5)
            if self.consecutive_failures >= threshold:
                try:
                    write_alert_file(
                        self.config,
                        cycle_start,
                        str(exc),
                        self.consecutive_failures,
                        trace_text,
                    )
                except Exception as alert_error:
                    logger.critical(f"Failed to write alert file: {alert_error}")

            # Persist the failure streak alongside whatever the checkpoint held.
            failed_state = read_checkpoint(self.checkpoint_path)
            failed_state["consecutive_failures"] = self.consecutive_failures
            write_checkpoint(self.checkpoint_path, failed_state)

        # Wait for the next cycle unless a stop was requested meanwhile.
        if self.running:
            nap_seconds = jitter(self.interval)
            logger.info(f"Sleeping for {nap_seconds:.1f}s before next cycle...")
            _pause(nap_seconds)

    logger.info("Collector has shut down gracefully.")
|
||||
102
p2p-collector/storage.py
Normal file
102
p2p-collector/storage.py
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
import os
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import pandas as pd
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def store_parquet(rows: list, base_dir: str, fetched_at: datetime) -> str:
    """Write one snapshot of ad rows to a date-partitioned Parquet file.

    The file lands at
    ``<base_dir>/year=YYYY/month=MM/day=DD/snapshot_YYYYMMDD_HHMMSS.parquet``,
    with every path component derived from ``fetched_at`` exactly as given
    (NOTE(review): presumably a UTC timestamp -- confirm against callers).
    Data is first written to ``<name>.parquet.tmp`` and then moved over the
    final name, so readers never observe a partially written snapshot.

    After a successful write, an empty ``_schema.parquet`` carrying the same
    columns is created in the partition directory if it does not exist yet.
    That step is best-effort: a failure is logged and never raised.

    Args:
        rows: Row dictionaries; columns are sorted by name before writing.
        base_dir: Root directory under which the partitions are created.
        fetched_at: Snapshot timestamp used for the partition and file name.

    Returns:
        The final file path as a string, or ``""`` when ``rows`` is empty
        (nothing is written in that case).

    Raises:
        Exception: Whatever the DataFrame construction, Parquet write or
            file move raised, after logging and removing the temp file.
    """
    if not rows:
        logger.warning("No rows provided to store_parquet.")
        return ""

    filename = f"snapshot_{fetched_at.strftime('%Y%m%d_%H%M%S')}.parquet"
    dest_dir = (
        Path(base_dir)
        / f"year={fetched_at.strftime('%Y')}"
        / f"month={fetched_at.strftime('%m')}"
        / f"day={fetched_at.strftime('%d')}"
    )
    dest_dir.mkdir(parents=True, exist_ok=True)

    tmp_path = dest_dir / (filename + ".tmp")
    final_path = dest_dir / filename

    try:
        df = pd.DataFrame(rows)
        # Sort columns so every snapshot has the same schema layout.
        df = df.reindex(sorted(df.columns), axis=1)

        df.to_parquet(tmp_path, index=False, engine="pyarrow")
        # Path.replace overwrites an existing target on every platform,
        # whereas Path.rename fails on Windows if the target exists.
        tmp_path.replace(final_path)
    except Exception as e:
        logger.error(f"Failed to write parquet file {final_path}: {e}")
        # Best-effort removal of the partial temp file.
        try:
            tmp_path.unlink()
        except FileNotFoundError:
            pass  # Nothing was written, so there is nothing to clean up.
        except Exception as cleanup_error:
            logger.error(f"Failed to delete temp file {tmp_path}: {cleanup_error}")
        raise

    logger.info(f"Successfully stored snapshot to {final_path}")

    # Schema reference file: never let it fail an already-stored snapshot.
    schema_path = dest_dir / "_schema.parquet"
    try:
        if not schema_path.exists():
            # Zero-row frame with identical columns and dtypes.
            df.iloc[:0].to_parquet(schema_path, index=False, engine="pyarrow")
            logger.info(f"Created schema reference file at {schema_path}")
    except Exception as e:
        logger.error(f"Failed to create schema reference file: {e}")

    return str(final_path)
|
||||
|
||||
|
||||
def read_checkpoint(path: str) -> dict:
    """Load the checkpoint JSON file, falling back to an empty dict.

    Args:
        path: Location of the checkpoint file.

    Returns:
        The decoded checkpoint dictionary. An empty dict is returned when the
        file does not exist, cannot be read or parsed, or contains valid JSON
        that is not an object (callers use ``.get`` on the result, so a
        list or scalar would break them).
    """
    # Open directly instead of checking existence first: this avoids a race
    # with a concurrent delete and saves a filesystem call.
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        logger.info(f"Checkpoint file {path} not found. Starting fresh.")
        return {}
    except Exception as e:
        logger.warning(f"Checkpoint file {path} exists but is corrupted: {e}. Starting fresh.")
        return {}

    if not isinstance(data, dict):
        logger.warning(
            f"Checkpoint file {path} exists but is corrupted: "
            f"expected a JSON object, got {type(data).__name__}. Starting fresh."
        )
        return {}

    logger.info(f"Successfully read checkpoint from {path}")
    return data
|
||||
|
||||
|
||||
def write_checkpoint(path: str, data: dict) -> None:
    """Persist the checkpoint dictionary as JSON without exposing partial writes.

    The JSON is written to ``<path>.tmp`` and then moved over ``path`` with
    ``os.replace``, so a reader sees either the old or the new checkpoint.

    Write failures are logged and swallowed rather than raised; the leftover
    temp file is then removed on a best-effort basis. Creating the parent
    directory happens before that guarded section, so an error from
    ``os.makedirs`` still propagates.

    Args:
        path: Destination of the checkpoint file.
        data: JSON-serialisable checkpoint contents.
    """
    dir_path = os.path.dirname(path)
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)

    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4)
        os.replace(tmp_path, path)
        logger.debug(f"Checkpoint updated at {path}")
    except Exception as e:
        logger.error(f"Failed to write checkpoint to {path}: {e}")
        try:
            os.remove(tmp_path)
        except FileNotFoundError:
            pass  # The temp file was never created or was already moved.
        except OSError as cleanup_error:
            # Report the leftover instead of hiding it; still do not raise.
            logger.warning(f"Failed to delete temp checkpoint file {tmp_path}: {cleanup_error}")
|
||||
107
p2p-collector/tests/test_normalizer.py
Normal file
107
p2p-collector/tests/test_normalizer.py
Normal file
|
|
@ -0,0 +1,107 @@
|
|||
import unittest
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime, timezone
|
||||
|
||||
# Add the parent folder to path to import normalizer
|
||||
import sys
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from normalizer import normalize_ad
|
||||
|
||||
class TestNormalizer(unittest.TestCase):
    """Checks that ``normalize_ad`` flattens raw sample ads into the expected row."""

    def setUp(self):
        # Sample API responses live next to the package, one level above tests/.
        here = os.path.dirname(os.path.abspath(__file__))
        self.buy_json_path = os.path.join(here, "..", "sample_responses", "response_buy.json")
        self.sell_json_path = os.path.join(here, "..", "sample_responses", "response_sell.json")

    @staticmethod
    def _first_ad(json_path):
        # Return the first ad of a stored sample response.
        with open(json_path, "r", encoding="utf-8") as f:
            return json.load(f)["data"][0]

    def test_normalize_buy_ad(self):
        fetched_at = datetime(2026, 6, 5, 13, 30, 0, tzinfo=timezone.utc)
        normalized = normalize_ad(self._first_ad(self.buy_json_path), "BUY", fetched_at)

        # The row must expose exactly the documented schema keys.
        expected_keys = {
            "snapshot_id", "fetched_at", "fetched_date", "trade_type", "adv_no",
            "asset", "fiat", "price", "surplus_amount", "min_amount", "max_amount",
            "tradable_quantity", "advertiser_no", "advertiser_name", "advertiser_type",
            "month_order_count", "month_finish_rate", "positive_rate", "user_positive_rate",
            "payment_methods", "payment_method_ids", "ad_created_at", "price_type",
        }
        self.assertEqual(set(normalized.keys()), expected_keys)

        # Field-by-field content mapping, checked in declaration order.
        expected_values = {
            "snapshot_id": "20260605T133000Z_BUY",
            "fetched_at": fetched_at,
            "fetched_date": "2026-06-05",
            "trade_type": "BUY",
            "adv_no": "6f8b2e12345",
            "asset": "USDT",
            "fiat": "VES",
            "price": 58.50,
            "surplus_amount": 1520.43,
            "min_amount": 100.0,
            "max_amount": 5000.0,
            "tradable_quantity": 1520.43,
            "advertiser_no": "ABC123456",
            "advertiser_name": "CryptoTraderVE",
            "advertiser_type": "merchant",
            "month_order_count": 342,
            "month_finish_rate": 0.97,
            "positive_rate": 0.99,
            "user_positive_rate": 0.99,
            "payment_methods": ["BANESCO", "PAGO_MOVIL"],
            "payment_method_ids": ["Banco_Banesco", "Pago_Movil"],
            "price_type": "FIXED",
            # Creation time is parsed from the epoch-milliseconds value 1749128400000.
            "ad_created_at": datetime.fromtimestamp(1749128400000 / 1000, tz=timezone.utc),
        }
        for field, wanted in expected_values.items():
            self.assertEqual(normalized[field], wanted)

    def test_normalize_sell_ad(self):
        fetched_at = datetime(2026, 6, 5, 13, 30, 0, tzinfo=timezone.utc)
        normalized = normalize_ad(self._first_ad(self.sell_json_path), "SELL", fetched_at)

        expected_values = {
            "snapshot_id": "20260605T133000Z_SELL",
            "trade_type": "SELL",
            "adv_no": "7a9c3d98765",
            "price": 62.30,
            "payment_methods": ["MERCANTIL", "PAGO_MOVIL"],
            "payment_method_ids": ["Banco_Mercantil", "Pago_Movil"],
        }
        for field, wanted in expected_values.items():
            self.assertEqual(normalized[field], wanted)

    def test_defensive_handling(self):
        # Unparseable or missing values must not crash normalization.
        bad_raw_ad = {
            "adv": {
                "advNo": "bad_ad",
                "price": "not_a_float",
                "surplusAmount": None,
            },
            "advertiser": {
                "userNo": "bad_advertiser",
                "monthOrderCount": "not_an_int",
            },
        }

        normalized = normalize_ad(bad_raw_ad, "BUY", datetime.now(timezone.utc))

        # Identifiers pass through; broken numerics fall back to zero defaults.
        expected_values = {
            "adv_no": "bad_ad",
            "price": 0.0,
            "surplus_amount": 0.0,
            "month_order_count": 0,
            "advertiser_no": "bad_advertiser",
        }
        for field, wanted in expected_values.items():
            self.assertEqual(normalized[field], wanted)
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
72
p2p-collector/tests/test_storage.py
Normal file
72
p2p-collector/tests/test_storage.py
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
import unittest
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
import pandas as pd
|
||||
|
||||
import sys
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from storage import store_parquet, read_checkpoint, write_checkpoint
|
||||
|
||||
class TestStorage(unittest.TestCase):
    """Round-trip tests for the Parquet snapshot writer and the checkpoint helpers."""

    def setUp(self):
        # Every test works inside its own throw-away directory.
        self.temp_dir = tempfile.TemporaryDirectory()
        self.base_dir = self.temp_dir.name

    def tearDown(self):
        self.temp_dir.cleanup()

    def test_store_parquet(self):
        snapshot_time = datetime(2026, 6, 5, 13, 30, 0, tzinfo=timezone.utc)
        sample_rows = [
            {"col1": "val1", "col2": 1.5, "col3": True},
            {"col1": "val2", "col2": 2.5, "col3": False},
        ]

        written_path = store_parquet(sample_rows, self.base_dir, snapshot_time)
        self.assertTrue(os.path.exists(written_path))

        # Expected layout: year=2026/month=06/day=05/snapshot_20260605_133000.parquet
        partition_dir = os.path.join(self.base_dir, "year=2026", "month=06", "day=05")
        self.assertTrue(written_path.startswith(partition_dir))
        self.assertTrue(written_path.endswith("snapshot_20260605_133000.parquet"))

        # The schema sidecar exists, has no rows, and lists the sorted columns.
        schema_file = os.path.join(partition_dir, "_schema.parquet")
        self.assertTrue(os.path.exists(schema_file))
        schema_frame = pd.read_parquet(schema_file)
        self.assertEqual(len(schema_frame), 0)
        self.assertEqual(list(schema_frame.columns), ["col1", "col2", "col3"])

        # The stored data reads back unchanged.
        stored_frame = pd.read_parquet(written_path)
        self.assertEqual(len(stored_frame), 2)
        self.assertEqual(stored_frame.iloc[0]["col1"], "val1")
        self.assertEqual(stored_frame.iloc[1]["col2"], 2.5)

    def test_checkpoint(self):
        checkpoint_file = os.path.join(self.base_dir, "checkpoint.json")

        # A missing file yields an empty checkpoint.
        self.assertEqual(read_checkpoint(checkpoint_file), {})

        # What is written must be read back verbatim.
        payload = {
            "last_completed_snapshot": "2026-06-05T13:30:00Z",
            "last_buy_ad_count": 47,
            "last_sell_ad_count": 53,
        }
        write_checkpoint(checkpoint_file, payload)
        self.assertEqual(read_checkpoint(checkpoint_file), payload)
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
97
p2p-collector/tests/test_validator.py
Normal file
97
p2p-collector/tests/test_validator.py
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
import unittest
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import os
|
||||
import sys
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from validator import validate_row, validate_snapshot
|
||||
|
||||
class TestValidator(unittest.TestCase):
    """Unit tests for the row-level and snapshot-level validators."""

    def setUp(self):
        self.config = {
            "validation": {
                "price_min": 1.0,
                "price_max": 500.0,
                "reject_zero_finish_rate": True,
                "reject_zero_surplus": True,
            }
        }
        # Baseline row that passes every check; tests override single fields.
        self.valid_row = {
            "adv_no": "12345",
            "price": 58.50,
            "surplus_amount": 100.0,
            "month_finish_rate": 0.95,
            "month_order_count": 100,
            "payment_methods": ["BANESCO"],
            "ad_created_at": datetime.now(timezone.utc),
        }

    def _row(self, **overrides):
        # Shallow copy of the baseline row with the given fields replaced.
        return {**self.valid_row, **overrides}

    def test_validate_row_valid(self):
        seen = set()
        self.assertTrue(validate_row(self.valid_row, self.config, seen))
        self.assertIn("12345", seen)

    def test_validate_row_duplicate(self):
        already_seen = {"12345"}
        self.assertFalse(validate_row(self.valid_row, self.config, already_seen))

    def test_validate_row_invalid_price(self):
        seen = set()
        # A non-positive price, then a price above the configured maximum.
        self.assertFalse(validate_row(self._row(price=-1.0), self.config, seen))
        self.assertFalse(validate_row(self._row(price=1000.0), self.config, seen))

    def test_validate_row_zero_surplus(self):
        self.assertFalse(validate_row(self._row(surplus_amount=0.0), self.config, set()))

    def test_validate_row_suspicious_finish(self):
        # Orders on record but a 0% finish rate is treated as suspicious.
        suspicious = self._row(month_finish_rate=0.0, month_order_count=5)
        self.assertFalse(validate_row(suspicious, self.config, set()))

    def test_validate_snapshot_empty(self):
        with self.assertRaises(ValueError):
            validate_snapshot([], datetime.now(timezone.utc))

    def test_validate_snapshot_calculations(self):
        fetched_at = datetime.now(timezone.utc)
        # (trade_type, price, payment method): two BUY ads, then two SELL ads.
        specs = [
            ("BUY", 58.00, "BANESCO"),
            ("BUY", 59.00, "PAGO_MOVIL"),
            ("SELL", 61.00, "MERCANTIL"),
            ("SELL", 62.00, "BANESCO"),
        ]
        ads = [
            {"trade_type": side, "price": price, "payment_methods": [method], "ad_created_at": fetched_at}
            for side, price, method in specs
        ]

        summary = validate_snapshot(ads, fetched_at)

        expected = {
            "buy_count": 2,
            "sell_count": 2,
            "buy_min": 58.00,
            "buy_max": 59.00,
            "sell_min": 61.00,
            "sell_max": 62.00,
            "buy_median": 58.50,
            "sell_median": 61.50,
            # spread = sell_min - buy_max = 61.00 - 59.00
            "spread": 2.00,
        }
        for key, wanted in expected.items():
            self.assertEqual(summary[key], wanted)
        self.assertEqual(set(summary["methods"]), {"BANESCO", "PAGO_MOVIL", "MERCANTIL"})
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
15
p2p-collector/utils.py
Normal file
15
p2p-collector/utils.py
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
import random
|
||||
from datetime import datetime, timezone
|
||||
|
||||
def jitter(interval: float, fraction: float = 0.10) -> float:
    """Return ``interval`` randomly perturbed by up to ±``fraction`` of itself.

    Args:
        interval: Base duration (the collector passes seconds).
        fraction: Maximum relative deviation in either direction; the
            default of ``0.10`` gives the interval ± 10%.

    Returns:
        A value drawn uniformly from
        ``[interval * (1 - fraction), interval * (1 + fraction)]``.
    """
    variation = interval * fraction
    return interval + random.uniform(-variation, variation)
|
||||
|
||||
def now_utc() -> datetime:
    """Return the present moment as a timezone-aware ``datetime`` in UTC."""
    return datetime.now(tz=timezone.utc)
|
||||
147
p2p-collector/validator.py
Normal file
147
p2p-collector/validator.py
Normal file
|
|
@ -0,0 +1,147 @@
|
|||
import logging
|
||||
from datetime import datetime, timezone
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def validate_row(row: dict, config: dict, seen_adv_nos: set) -> bool:
    """Decide whether a single normalized ad row should be kept.

    Checks run in this order, and the first failing check rejects the row:

    1. ``adv_no`` is missing or empty.
    2. ``adv_no`` was already accepted in this snapshot.
    3. ``price`` is ``None`` or not positive.
    4. ``price`` is outside ``[price_min, price_max]`` (NaN is rejected here).
    5. ``surplus_amount`` is ``None``, NaN or not positive (only when
       ``reject_zero_surplus`` is enabled).
    6. ``month_finish_rate`` is exactly 0.0 while ``month_order_count`` is
       positive (only when ``reject_zero_finish_rate`` is enabled).

    Args:
        row: Normalized ad row.
        config: Collector configuration; thresholds are read from its
            ``"validation"`` section (defaults: price range 1.0-500.0,
            both reject flags enabled).
        seen_adv_nos: Ad numbers accepted so far in this snapshot. It is
            updated in place, and only when the row is accepted.

    Returns:
        ``True`` if the row passed every check, ``False`` otherwise.
    """
    val_config = config.get("validation", {})
    price_min = val_config.get("price_min", 1.0)
    price_max = val_config.get("price_max", 500.0)
    reject_zero_finish = val_config.get("reject_zero_finish_rate", True)
    reject_zero_surplus = val_config.get("reject_zero_surplus", True)

    # 1. Empty adv_no
    adv_no = row.get("adv_no")
    if not adv_no:
        logger.error("Rejecting ad: Missing adv_no.")
        return False

    # 2. Duplicate adv_no within same snapshot
    if adv_no in seen_adv_nos:
        logger.warning(f"Rejecting ad {adv_no}: Duplicate within the same snapshot.")
        return False

    # 3. Price is None or <= 0
    price = row.get("price")
    if price is None or price <= 0:
        logger.warning(f"Rejecting ad {adv_no}: Price is None or <= 0 ({price}).")
        return False

    # 4. Price outside expected range. The check is phrased as "not inside"
    # so that NaN, for which every comparison is False, is rejected as well.
    if not (price_min <= price <= price_max):
        logger.warning(
            f"Rejecting ad {adv_no}: Price {price} is outside configured range [{price_min}, {price_max}]."
        )
        return False

    # 5. Surplus amount None or <= 0 ("not > 0" also catches NaN)
    surplus = row.get("surplus_amount")
    if reject_zero_surplus and (surplus is None or not surplus > 0):
        logger.debug(f"Rejecting ad {adv_no}: Surplus amount is None or <= 0 ({surplus}).")
        return False

    # 6. Suspicious advertiser stats: monthFinishRate is 0.0 and monthOrderCount > 0.
    # A missing order count is treated as 0 so the comparison cannot raise.
    finish_rate = row.get("month_finish_rate")
    order_count = row.get("month_order_count") or 0
    if reject_zero_finish and finish_rate == 0.0 and order_count > 0:
        logger.warning(
            f"Rejecting ad {adv_no}: Advertiser finished 0.0% of {order_count} orders."
        )
        return False

    seen_adv_nos.add(adv_no)
    return True
|
||||
|
||||
|
||||
def validate_snapshot(flat_ads: list, fetched_at: datetime) -> dict:
    """Sanity-check a whole snapshot and compute its summary statistics.

    Only a completely empty snapshot is fatal. Unusual ad counts, a
    BUY/SELL price overlap and stale ads produce log warnings only.

    Args:
        flat_ads: Normalized, already row-validated ads of both trade types.
        fetched_at: Snapshot timestamp, used to compute each ad's age.

    Returns:
        A dict with ``buy_count``, ``sell_count``, ``buy_min``, ``buy_max``,
        ``sell_min``, ``sell_max``, ``buy_median``, ``sell_median``,
        ``spread`` (``sell_min - buy_max``), ``stale_count`` and ``methods``
        (sorted unique payment methods). Price statistics are ``0.0`` for a
        side with no ads; ``spread`` is ``0.0`` unless both sides have ads.

    Raises:
        ValueError: If the snapshot contains neither BUY nor SELL ads.
    """
    buy_ads = [ad for ad in flat_ads if ad.get("trade_type") == "BUY"]
    sell_ads = [ad for ad in flat_ads if ad.get("trade_type") == "SELL"]

    # 1. Empty snapshot validation
    if not buy_ads and not sell_ads:
        raise ValueError("CRITICAL: Empty snapshot! Both BUY and SELL ad counts are 0.")

    buy_count = len(buy_ads)
    sell_count = len(sell_ads)

    # Warnings for low/high counts
    if buy_count < 20 or buy_count > 200:
        logger.warning(f"Unusual BUY ad count: {buy_count} (expected 20-200).")
    if sell_count < 20 or sell_count > 200:
        logger.warning(f"Unusual SELL ad count: {sell_count} (expected 20-200).")

    buy_prices = [ad["price"] for ad in buy_ads]
    sell_prices = [ad["price"] for ad in sell_ads]

    buy_min = min(buy_prices) if buy_prices else 0.0
    buy_max = max(buy_prices) if buy_prices else 0.0
    sell_min = min(sell_prices) if sell_prices else 0.0
    sell_max = max(sell_prices) if sell_prices else 0.0

    buy_median = float(np.median(buy_prices)) if buy_prices else 0.0
    sell_median = float(np.median(sell_prices)) if sell_prices else 0.0

    # Spread per spec: SELL_min - BUY_max.
    both_sides = bool(sell_prices and buy_prices)
    spread = sell_min - buy_max if both_sides else 0.0

    # In a BUY ad the advertiser buys USDT and wants to pay as little VES as
    # possible; in a SELL ad the advertiser sells USDT and wants as much VES
    # as possible. So SELL_min is expected to sit above BUY_max, and an
    # overlap (negative spread) is worth flagging.
    if both_sides and buy_max > sell_min:
        logger.warning(
            f"BUY/SELL price overlap detected! Max BUY price ({buy_max:.2f}) > Min SELL price ({sell_min:.2f})."
        )

    # Check for stale ads (createTime > 7 days old); ads without a creation
    # time are skipped.
    stale_count = 0
    for ad in flat_ads:
        created_at = ad.get("ad_created_at")
        if created_at:
            age_days = (fetched_at - created_at).total_seconds() / (24 * 3600)
            if age_days > 7.0:
                stale_count += 1

    if stale_count > 0:
        logger.warning(f"Stale ads detected: {stale_count} ads were created > 7 days ago.")

    # Unique payment methods; "or []" tolerates a key that is present but None.
    all_methods = set()
    for ad in flat_ads:
        all_methods.update(ad.get("payment_methods") or [])

    return {
        "buy_count": buy_count,
        "sell_count": sell_count,
        "buy_min": buy_min,
        "buy_max": buy_max,
        "sell_min": sell_min,
        "sell_max": sell_max,
        "buy_median": buy_median,
        "sell_median": sell_median,
        "spread": spread,
        "stale_count": stale_count,
        "methods": sorted(all_methods),
    }
|
||||
Loading…
Reference in a new issue