feat: implement binance p2p collector daemon
Set up continuous P2P VES/USDT market history data collection, normalization, validation, and date-partitioned Parquet storage.
This commit is contained in:
parent
8e7a77fe61
commit
2c41a7a6b3
19 changed files with 2313 additions and 0 deletions
11
.gitignore
vendored
Normal file
11
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
.venv/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
p2p-collector/data/
|
||||||
|
*.log
|
||||||
|
*.tmp
|
||||||
|
*.alert
|
||||||
|
checkpoint.json
|
||||||
760
base_plan.md
Normal file
760
base_plan.md
Normal file
|
|
@ -0,0 +1,760 @@
|
||||||
|
# Binance P2P Data Collection — Detailed Implementation Spec
|
||||||
|
|
||||||
|
> **Purpose:** This document is the single source of truth for the data collection phase. Every field, every endpoint, every edge case is specified so a coder can implement without ambiguity.
|
||||||
|
>
|
||||||
|
> **Status:** Phase 1 — Data Collection only. No ML. No trading. No algorithm decisions yet.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. The Core Loop (Exact Pseudocode)
|
||||||
|
|
||||||
|
```
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
buy_snap = fetch_all_ads(tradeType="BUY", asset="USDT", fiat="VES")
|
||||||
|
sell_snap = fetch_all_ads(tradeType="SELL", asset="USDT", fiat="VES")
|
||||||
|
|
||||||
|
flat_buy = [normalize_ad(ad, "BUY", now_utc) for ad in buy_snap]
|
||||||
|
flat_sell = [normalize_ad(ad, "SELL", now_utc) for ad in sell_snap]
|
||||||
|
|
||||||
|
validate_snapshot(flat_buy + flat_sell)
|
||||||
|
|
||||||
|
store_parquet(flat_buy, base_path / "raw" / "buy_ads" / date_partition)
|
||||||
|
store_parquet(flat_sell, base_path / "raw" / "sell_ads" / date_partition)
|
||||||
|
|
||||||
|
log_success(len(flat_buy), len(flat_sell), elapsed)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
log_error(e, consecutive_failures)
|
||||||
|
consecutive_failures += 1
|
||||||
|
if consecutive_failures >= 5:
|
||||||
|
write_alert_file() # human needs to check
|
||||||
|
|
||||||
|
sleep(jitter(interval_seconds)) # default 300s ± 10%
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. API Client — Exact Implementation
|
||||||
|
|
||||||
|
### 2.1 Endpoint
|
||||||
|
|
||||||
|
```
|
||||||
|
POST https://p2p.binance.com/bapi/c2c/v2/friendly/c2c/adv/search
|
||||||
|
```
|
||||||
|
|
||||||
|
**No API key.** This is fully public.
|
||||||
|
|
||||||
|
### 2.2 Headers
|
||||||
|
|
||||||
|
| Header | Value |
|
||||||
|
|---|---|
|
||||||
|
| `Content-Type` | `application/json` |
|
||||||
|
| `User-Agent` | `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36` |
|
||||||
|
| `Accept` | `*/*` |
|
||||||
|
| `Origin` | `https://p2p.binance.com` |
|
||||||
|
| `Referer` | `https://p2p.binance.com/` |
|
||||||
|
|
||||||
|
### 2.3 Request Body (BUY example)
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"asset": "USDT",
|
||||||
|
"fiat": "VES",
|
||||||
|
"tradeType": "BUY",
|
||||||
|
"page": 1,
|
||||||
|
"rows": 20,
|
||||||
|
"payTypes": [],
|
||||||
|
"countries": [],
|
||||||
|
"publisherType": null,
|
||||||
|
"classify": "personal",
|
||||||
|
"filter": {}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key notes for the coder:**
|
||||||
|
- `tradeType: "BUY"` = advertiser wants to **give you VES** in exchange for your USDT. They are *buying* USDT from you.
|
||||||
|
- `tradeType: "SELL"` = advertiser wants to **give you USDT** in exchange for your VES. They are *selling* USDT to you.
|
||||||
|
- `payTypes: []` = no filter, return all payment methods
|
||||||
|
- `rows: 20` = Binance's max per page (do not change)
|
||||||
|
- `publisherType: null` = both merchants and regular users
|
||||||
|
- `classify: "personal"` = personal ads (not business) — covers the P2P marketplace
|
||||||
|
|
||||||
|
### 2.4 Pagination Logic
|
||||||
|
|
||||||
|
```python
|
||||||
|
def fetch_all_ads(trade_type, asset, fiat, max_pages=10):
|
||||||
|
all_ads = []
|
||||||
|
|
||||||
|
for page in range(1, max_pages + 1):
|
||||||
|
body = {
|
||||||
|
"asset": asset,
|
||||||
|
"fiat": fiat,
|
||||||
|
"tradeType": trade_type,
|
||||||
|
"page": page,
|
||||||
|
"rows": 20,
|
||||||
|
"payTypes": [],
|
||||||
|
"countries": [],
|
||||||
|
"publisherType": None,
|
||||||
|
"classify": "personal",
|
||||||
|
"filter": {}
|
||||||
|
}
|
||||||
|
|
||||||
|
resp = httpx.post(URL, json=body, headers=HEADERS, timeout=15)
|
||||||
|
resp.raise_for_status()
|
||||||
|
|
||||||
|
data = resp.json()
|
||||||
|
|
||||||
|
if not data.get("success"):
|
||||||
|
raise APIError(f"API returned success=false: {data}")
|
||||||
|
|
||||||
|
ads = data.get("data", [])
|
||||||
|
total = data.get("total", 0)
|
||||||
|
|
||||||
|
all_ads.extend(ads)
|
||||||
|
|
||||||
|
# Stop if we've collected all available ads
|
||||||
|
if len(all_ads) >= total:
|
||||||
|
break
|
||||||
|
|
||||||
|
# Don't request a page that starts beyond total ads
|
||||||
|
if page * 20 >= total:
|
||||||
|
break
|
||||||
|
|
||||||
|
if page < max_pages:
|
||||||
|
time.sleep(0.5) # 500ms between pages
|
||||||
|
|
||||||
|
return all_ads
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.5 Rate Limiting — Defensive Strategy
|
||||||
|
|
||||||
|
| Event | Wait time | Notes |
|
||||||
|
|---|---|---|
|
||||||
|
| Between pages (same snapshot) | 500 ms | Fixed |
|
||||||
|
| Between snapshots (BUY → SELL) | 1 second | Fixed |
|
||||||
|
| Between full cycles | 300 s ± 30s | Jittered to avoid clock sync |
|
||||||
|
| HTTP 429 (rate limited) | 60s → 120s → 240s → 480s | Exponential backoff, cap at 480s |
|
||||||
|
| Connection error | 30s retry | Transient network issues |
|
||||||
|
| 5xx server error | 60s retry | Binance server-side issues |
|
||||||
|
|
||||||
|
**Important:** After a 429, keep the increased backoff until one full snapshot succeeds, then reset it to the initial 60s.
|
||||||
|
|
||||||
|
### 2.6 Proxy Support (Optional — keep simple first)
|
||||||
|
|
||||||
|
Start with **no proxy**, direct from the VPS. Only add proxy rotation if we hit rate limits. Binance rarely rate-limits P2P at this volume: one collection cycle (at most `max_pages` paginated requests per side) every 5 minutes.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Normalization — Exact Field Mapping
|
||||||
|
|
||||||
|
### 3.1 The Flattened Schema (one row = one ad)
|
||||||
|
|
||||||
|
| # | Output field | Type | JSON path | Notes |
|
||||||
|
|---|---|---|---|---|
|
||||||
|
| 1 | `snapshot_id` | string | auto: `{fetch_ts_iso}_{trade_type}` | e.g. `"20260605T133000Z_BUY"` |
|
||||||
|
| 2 | `fetched_at` | datetime | auto: now_utc | Always UTC |
|
||||||
|
| 3 | `fetched_date` | string | auto: YYYY-MM-DD | Partition column |
|
||||||
|
| 4 | `trade_type` | string | `adv.tradeType` | "BUY" or "SELL" |
|
||||||
|
| 5 | `adv_no` | string | `adv.advNo` | Unique ad ID |
|
||||||
|
| 6 | `asset` | string | `adv.asset` | "USDT" |
|
||||||
|
| 7 | `fiat` | string | `adv.fiatUnit` | "VES" |
|
||||||
|
| 8 | `price` | float | `adv.price` | Parse as float |
|
||||||
|
| 9 | `surplus_amount` | float | `adv.surplusAmount` | Remaining USDT |
|
||||||
|
| 10 | `min_amount` | float | `adv.minSingleTransAmount` | Min USDT per trade |
|
||||||
|
| 11 | `max_amount` | float | `adv.maxSingleTransAmount` | Max USDT per trade |
|
||||||
|
| 12 | `tradable_quantity` | float | `adv.tradableQuantity` | Same as surplus? |
|
||||||
|
| 13 | `advertiser_no` | string | `advertiser.userNo` | **Stable ID** — use this |
|
||||||
|
| 14 | `advertiser_name` | string | `advertiser.nickName` | For reference only |
|
||||||
|
| 15 | `advertiser_type` | string | `advertiser.userType` | "merchant" or "user" |
|
||||||
|
| 16 | `month_order_count` | int | `advertiser.monthOrderCount` | |
|
||||||
|
| 17 | `month_finish_rate` | float | `advertiser.monthFinishRate` | 0.0 to 1.0 |
|
||||||
|
| 18 | `positive_rate` | float | `advertiser.positiveRate` | 0.0 to 1.0 |
|
||||||
|
| 19 | `user_positive_rate` | float | `advertiser.userPositiveRate` | older field, same idea |
|
||||||
|
| 20 | `payment_methods` | list[str] | `adv.tradeMethods[].payType` | e.g. `["BANESCO", "PAGO_MOVIL"]` |
|
||||||
|
| 21 | `payment_method_ids` | list[str] | `adv.tradeMethods[].identifier` | e.g. `["Banco_Banesco", "Pago_Movil"]` |
|
||||||
|
| 22 | `ad_created_at` | datetime | `adv.createTime` | Unix millisecond → datetime |
|
||||||
|
| 23 | `price_type` | string | `adv.priceType` | Usually "FIXED" |
|
||||||
|
|
||||||
|
### 3.2 JSON Path Details (nested structure)
|
||||||
|
|
||||||
|
The API response has this structure:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"data": [
|
||||||
|
{
|
||||||
|
"adv": {
|
||||||
|
"advNo": "6f8b2e...",
|
||||||
|
"tradeType": "BUY",
|
||||||
|
"asset": "USDT",
|
||||||
|
"fiatUnit": "VES",
|
||||||
|
"price": "58.50",
|
||||||
|
"surplusAmount": "1520.43",
|
||||||
|
"maxSingleTransAmount": "5000.00",
|
||||||
|
"minSingleTransAmount": "100.00",
|
||||||
|
"tradableQuantity": "1520.43",
|
||||||
|
"createTime": 1749128400000,
|
||||||
|
"fiatSymbol": "Bs",
|
||||||
|
"priceType": "FIXED",
|
||||||
|
"tradeMethods": [
|
||||||
|
{
|
||||||
|
"identifier": "Banco_Banesco",
|
||||||
|
"payType": "BANESCO",
|
||||||
|
"payMethodId": "BANESCO"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"identifier": "Pago_Movil",
|
||||||
|
"payType": "PAGO_MOVIL",
|
||||||
|
"payMethodId": "PAGO_MOVIL"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"advertiser": {
|
||||||
|
"userNo": "ABC123",
|
||||||
|
"nickName": "CryptoTraderVE",
|
||||||
|
"userType": "merchant",
|
||||||
|
"monthOrderCount": 342,
|
||||||
|
"monthFinishRate": 0.97,
|
||||||
|
"positiveRate": 0.99,
|
||||||
|
"userPositiveRate": 0.99
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"total": 156,
|
||||||
|
"pageSize": 20,
|
||||||
|
"success": true
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.3 Normalization Code Sketch
|
||||||
|
|
||||||
|
```python
|
||||||
|
def normalize_ad(raw_ad: dict, trade_type: str, fetched_at: datetime) -> dict:
|
||||||
|
adv = raw_ad["adv"]
|
||||||
|
adver = raw_ad["advertiser"]
|
||||||
|
|
||||||
|
payment_methods = [m["payType"] for m in adv.get("tradeMethods", [])]
|
||||||
|
payment_method_ids = [m["identifier"] for m in adv.get("tradeMethods", [])]
|
||||||
|
|
||||||
|
return {
|
||||||
|
"snapshot_id": f"{fetched_at.strftime('%Y%m%dT%H%M%SZ')}_{trade_type}",
|
||||||
|
"fetched_at": fetched_at,
|
||||||
|
"fetched_date": fetched_at.strftime("%Y-%m-%d"),
|
||||||
|
"trade_type": trade_type,
|
||||||
|
"adv_no": adv["advNo"],
|
||||||
|
"asset": adv["asset"],
|
||||||
|
"fiat": adv["fiatUnit"],
|
||||||
|
"price": float(adv["price"]),
|
||||||
|
"surplus_amount": float(adv.get("surplusAmount", 0)),
|
||||||
|
"min_amount": float(adv.get("minSingleTransAmount", 0)),
|
||||||
|
"max_amount": float(adv.get("maxSingleTransAmount", 0)),
|
||||||
|
"tradable_quantity": float(adv.get("tradableQuantity", 0)),
|
||||||
|
"advertiser_no": adver["userNo"],
|
||||||
|
"advertiser_name": adver["nickName"],
|
||||||
|
"advertiser_type": adver.get("userType", "user"),
|
||||||
|
"month_order_count": adver.get("monthOrderCount", 0),
|
||||||
|
"month_finish_rate": float(adver.get("monthFinishRate", 0)),
|
||||||
|
"positive_rate": float(adver.get("positiveRate", 0)),
|
||||||
|
"user_positive_rate": float(adver.get("userPositiveRate", 0)),
|
||||||
|
"payment_methods": payment_methods, # e.g. ["BANESCO", "PAGO_MOVIL"]
|
||||||
|
"payment_method_ids": payment_method_ids, # e.g. ["Banco_Banesco", "Pago_Movil"]
|
||||||
|
"ad_created_at": datetime.fromtimestamp(
|
||||||
|
adv["createTime"] / 1000, tz=timezone.utc
|
||||||
|
),
|
||||||
|
"price_type": adv.get("priceType", "FIXED"),
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Payment Methods — The Critical Column
|
||||||
|
|
||||||
|
### 4.1 Known Payment Method Identifiers for Venezuela
|
||||||
|
|
||||||
|
| `payType` value | `identifier` value | Common name |
|
||||||
|
|---|---|---|
|
||||||
|
| `BANESCO` | `Banco_Banesco` | Banesco bank transfer |
|
||||||
|
| `MERCANTIL` | `Banco_Mercantil` | Mercantil bank transfer |
|
||||||
|
| `PROVINCIAL` | `Banco_Provincial` | Banco Provincial (BBVA) |
|
||||||
|
| `VENEZUELA` | `Banco_De_Venezuela` | Banco de Venezuela (BDV) |
|
||||||
|
| `BANCO_NACIONAL_CREDITO` | `Banco_Nacional_De_Credito` | BNC |
|
||||||
|
| `SOFITASA` | `Sofitasa` | Sofitasa |
|
||||||
|
| `BANCAMIGA` | `Bancamiga` | Bancamiga |
|
||||||
|
| `BANCO_EXTERIOR` | `Banco_Exterior` | Banco Exterior |
|
||||||
|
| `BANCO_OCCIDENTE` | `Banco_Occidente` | Banco Occidental de Descuento (BOD) |
|
||||||
|
| `BANCO_PLATA` | `Banco_Plata` | Banco Plaza |
|
||||||
|
| `BANESCO_PERSONAL` | `Banesco_Personal` | Banesco personal account |
|
||||||
|
| `PAGO_MOVIL` | `Pago_Movil` | Mobile payment (inter-bank) |
|
||||||
|
| `BANCANET` | `Bancanet` | Bancanet |
|
||||||
|
| `BANPLUS` | `Banplus` | Banplus |
|
||||||
|
| `ZELLE` | `Zelle` | Zelle (USD, not VES) |
|
||||||
|
| `PAYPAL` | `Paypal` | PayPal (USD) |
|
||||||
|
| `CASH_VEF` | `Efectivo_VEF` | Cash in VES |
|
||||||
|
| `CASH_USD` | `Efectivo_USD` | Cash in USD |
|
||||||
|
| `PAGO_MOVIL` | `Pago_Movil_Banco_Venezuela` | Mobile payment at specific bank |
|
||||||
|
|
||||||
|
### 4.2 Why This Matters for Bank Arbitrage
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Example analysis query after ~1 week of data:
|
||||||
|
# For each snapshot, find the best path:
|
||||||
|
#
|
||||||
|
# Best BUY price (sell USDT → get VES): Banesco, 60.50 VES/USDT
|
||||||
|
# Best SELL price (buy USDT → give VES): Mercantil, 62.30 VES/USDT
|
||||||
|
# Gross arbitrage: 62.30 - 60.50 = 1.80 VES/USDT = ~2.9% spread
|
||||||
|
#
|
||||||
|
# If same bank: you lose 0% on internal transfer
|
||||||
|
# If different banks: you lose bank transfer fee (maybe 0.5%)
|
||||||
|
# Net profit = 2.9% - 0.5% = 2.4% per round trip
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.3 Storage Consideration
|
||||||
|
|
||||||
|
`payment_methods` is a **list of strings** — this is fine in Parquet (stored as a repeated field). For CSV it would need to be JSON-encoded or one-hot encoded later.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Storage — Exact File Layout
|
||||||
|
|
||||||
|
```
|
||||||
|
/path/to/data/
|
||||||
|
├── raw/
|
||||||
|
│ ├── buy_ads/
|
||||||
|
│ │ └── year=2026/
|
||||||
|
│ │ └── month=06/
|
||||||
|
│ │ └── day=05/
|
||||||
|
│ │ ├── snapshot_20260605_133000.parquet
|
||||||
|
│ │ ├── snapshot_20260605_133500.parquet
|
||||||
|
│ │ └── ...
|
||||||
|
│ ├── sell_ads/
|
||||||
|
│ │ └── year=2026/
|
||||||
|
│ │ └── month=06/
|
||||||
|
│ │ └── day=05/
|
||||||
|
│ │ ├── snapshot_20260605_133000.parquet
|
||||||
|
│ │ └── ...
|
||||||
|
│ └── daily_merged/ <-- OPTIONAL: daily combined view
|
||||||
|
│ └── year=2026/
|
||||||
|
│ └── month=06/
|
||||||
|
│ └── 2026-06-05.parquet
|
||||||
|
│
|
||||||
|
├── logs/
|
||||||
|
│ └── collector_20260605.log
|
||||||
|
│
|
||||||
|
├── alerts/ <-- alert marker files go here
|
||||||
|
│ └── (empty if no issues)
|
||||||
|
│
|
||||||
|
└── checkpoint.json <-- for restart resilience
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5.1 File Naming Convention
|
||||||
|
|
||||||
|
**Snapshot files:** `snapshot_{YYYYMMDD}_{HHMMSS}.parquet`
|
||||||
|
- Time used: the start timestamp of the snapshot (UTC)
|
||||||
|
- Example: `snapshot_20260605_133000.parquet`
|
||||||
|
|
||||||
|
**Why no UUIDs?** The timestamp + trade_type partition is already unique. No repeated names unless you run two collectors (don't).
|
||||||
|
|
||||||
|
### 5.2 Atomic Writes (No Partial Files)
|
||||||
|
|
||||||
|
```python
|
||||||
|
def store_parquet(rows, base_dir, fetched_at):
|
||||||
|
if not rows:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Build partition path from timestamp
|
||||||
|
year = fetched_at.strftime("%Y")
|
||||||
|
month = fetched_at.strftime("%m")
|
||||||
|
day = fetched_at.strftime("%d")
|
||||||
|
filename = f"snapshot_{fetched_at.strftime('%Y%m%d_%H%M%S')}.parquet"
|
||||||
|
|
||||||
|
dest_dir = Path(base_dir) / f"year={year}" / f"month={month}" / f"day={day}"
|
||||||
|
dest_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# Write to temp file first
|
||||||
|
tmp_path = dest_dir / (filename + ".tmp")
|
||||||
|
final_path = dest_dir / filename
|
||||||
|
|
||||||
|
df = pd.DataFrame(rows)
|
||||||
|
df.to_parquet(tmp_path, index=False, engine="pyarrow")
|
||||||
|
|
||||||
|
# Atomic rename
|
||||||
|
tmp_path.rename(final_path)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5.3 Schema Consistency Check
|
||||||
|
|
||||||
|
Each partition directory should contain a schema marker file, written once after the first successful snapshot write in that partition:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# After the first successful write per partition, write _schema.parquet as a reference
|
||||||
|
schema_path = dest_dir / "_schema.parquet"
|
||||||
|
if not schema_path.exists():
|
||||||
|
df.iloc[:0].to_parquet(schema_path) # empty DataFrame with same schema
|
||||||
|
```
|
||||||
|
|
||||||
|
This allows downstream readers to discover the schema without reading a full snapshot.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Data Validation During Collection
|
||||||
|
|
||||||
|
### 6.1 Row-Level Rejection Rules
|
||||||
|
|
||||||
|
Reject (skip, don't crash) individual ads if:
|
||||||
|
|
||||||
|
| Condition | Why | Action |
|
||||||
|
|---|---|---|
|
||||||
|
| `price` is None or ≤ 0 | Bad data | Log warning, skip |
|
||||||
|
| `surplusAmount` is None or ≤ 0 | Ad has no USDT left | Log debug, skip |
|
||||||
|
| `monthFinishRate` is 0.0 and `monthOrderCount` > 0 | Merchant hasn't completed any orders (suspicious) | Log warning, skip |
|
||||||
|
| `price` < 1.0 or `price` > 500.0 | Way outside VES/USDT normal range (should be ~50–150) | Log warning, skip this ad |
|
||||||
|
| Empty `advNo` | Missing identifier | Log error, skip |
|
||||||
|
| Duplicate `advNo` within same snapshot | Possible API glitch | Log warning, keep first occurrence |
|
||||||
|
|
||||||
|
### 6.2 Snapshot-Level Validation
|
||||||
|
|
||||||
|
After collecting all ads for one snapshot:
|
||||||
|
|
||||||
|
```
|
||||||
|
✅ TOTAL ADS: BUY=47 SELL=53 (should be 20-200 each)
|
||||||
|
✅ PRICE RANGE: BUY [54.20 - 62.80] SELL [58.00 - 68.50]
|
||||||
|
(SELL should be consistently higher than BUY)
|
||||||
|
If not: LOG WARNING "BUY/SELL overlap detected"
|
||||||
|
✅ SPREAD: SELL_min - BUY_max = 58.00 - 62.80 = -4.80
|
||||||
|
(If negative: spread is inverted — unusual but possible)
|
||||||
|
Log: "Current spread: {spread:.2f} VES/USDT"
|
||||||
|
✅ MEDIAN PRICE: BUY=58.30 SELL=63.50
|
||||||
|
✅ AD STALENESS: 0 ads whose createTime is more than 7 days old
|
||||||
|
(If any: they're stale, still keep them, but log it)
|
||||||
|
✅ EMPTY SNAPSHOT: If BUY=0 AND SELL=0 → CRITICAL ALERT
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6.3 Snapshot Summary Log Line (one line per snapshot)
|
||||||
|
|
||||||
|
```
|
||||||
|
2026-06-05 13:30:00 UTC | BUY=47 ads [54.20–62.80] SELL=53 ads [58.00–68.50] | spread= -4.80 | took 3.2s | methods=[BANESCO,PAGO_MOVIL,MERCANTIL,...]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Scheduling & Lifecycle
|
||||||
|
|
||||||
|
### 7.1 Startup Behavior
|
||||||
|
|
||||||
|
```
|
||||||
|
1. Read checkpoint.json (if exists)
|
||||||
|
→ "last_completed_snapshot": "2026-06-05T13:25:00Z"
|
||||||
|
→ Wait until (last_completed + interval) before starting
|
||||||
|
→ If checkpoint is missing or corrupted, start immediately
|
||||||
|
|
||||||
|
2. Verify data directory is writable
|
||||||
|
→ Try writing a test file, then delete it
|
||||||
|
|
||||||
|
3. Log: "Starting collector. Interval=300s. Pairs=USDT/VES"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 7.2 Graceful Shutdown
|
||||||
|
|
||||||
|
```python
|
||||||
|
import signal
|
||||||
|
|
||||||
|
running = True
|
||||||
|
|
||||||
|
def handle_signal(sig, frame):
|
||||||
|
global running
|
||||||
|
logging.info("Received signal %s, finishing current snapshot...", sig)
|
||||||
|
running = False
|
||||||
|
|
||||||
|
signal.signal(signal.SIGINT, handle_signal)
|
||||||
|
signal.signal(signal.SIGTERM, handle_signal)
|
||||||
|
|
||||||
|
# In main loop:
|
||||||
|
while running:
|
||||||
|
# ... do snapshot ...
|
||||||
|
# Write checkpoint after each successful snapshot
|
||||||
|
write_checkpoint({"last_completed_snapshot": now_utc.isoformat()})
|
||||||
|
```
|
||||||
|
|
||||||
|
### 7.3 Checkpoint File Format
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"last_completed_snapshot": "2026-06-05T13:30:00Z",
|
||||||
|
"last_buy_ad_count": 47,
|
||||||
|
"last_sell_ad_count": 53,
|
||||||
|
"consecutive_failures": 0,
|
||||||
|
"total_snapshots": 284,
|
||||||
|
"first_snapshot": "2026-06-01T00:00:00Z",
|
||||||
|
"version": "1.0"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 7.4 Alert Marker File
|
||||||
|
|
||||||
|
After 5 consecutive failures, write:
|
||||||
|
|
||||||
|
```
|
||||||
|
/path/to/data/alerts/20260605_133000_5_failures.alert
|
||||||
|
```
|
||||||
|
|
||||||
|
Content:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"timestamp": "2026-06-05T13:30:00Z",
|
||||||
|
"error": "HTTP 500 after 3 retries",
|
||||||
|
"consecutive_failures": 5,
|
||||||
|
"traceback": "..."
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. First-Run Verification Protocol
|
||||||
|
|
||||||
|
After the collector writes its **first snapshot**, the coder should manually verify:
|
||||||
|
|
||||||
|
### Step 1: Read the Parquet file back
|
||||||
|
|
||||||
|
```python
|
||||||
|
import pandas as pd
|
||||||
|
df = pd.read_parquet("data/raw/buy_ads/year=2026/month=06/day=05/snapshot_20260605_133000.parquet")
|
||||||
|
df.info()
|
||||||
|
df.head()
|
||||||
|
```
|
||||||
|
|
||||||
|
Check:
|
||||||
|
- [ ] All columns present (23 columns from spec)
|
||||||
|
- [ ] No null values in critical fields (price, adv_no, advertiser_no)
|
||||||
|
- [ ] `price` is float type, not string
|
||||||
|
- [ ] `fetched_at` is datetime type
|
||||||
|
- [ ] `payment_methods` is a proper list column
|
||||||
|
|
||||||
|
### Step 2: Verify BUY vs SELL logic
|
||||||
|
|
||||||
|
```python
|
||||||
|
buy_ads = df[df["trade_type"] == "BUY"]
|
||||||
|
sell_ads = df[df["trade_type"] == "SELL"]
|
||||||
|
|
||||||
|
print(f"BUY ads count: {len(buy_ads)}")
|
||||||
|
print(f"SELL ads count: {len(sell_ads)}")
|
||||||
|
print(f"BUY price range: {buy_ads['price'].min():.2f} - {buy_ads['price'].max():.2f}")
|
||||||
|
print(f"SELL price range: {sell_ads['price'].min():.2f} - {sell_ads['price'].max():.2f}")
|
||||||
|
```
|
||||||
|
|
||||||
|
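Expected: SELL prices are higher than BUY prices (advertiser selling USDT charges a premium vs. buying USDT). Note: the file loaded in Step 1 contains only BUY rows, so load the matching `sell_ads` snapshot as well and concatenate both into `df` before running this check; otherwise `sell_ads` is empty.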
|
||||||
|
|
||||||
|
### Step 3: Verify payment methods are captured
|
||||||
|
|
||||||
|
```python
|
||||||
|
all_methods = set()
|
||||||
|
for methods in df["payment_methods"]:
|
||||||
|
all_methods.update(methods)
|
||||||
|
print(f"Payment methods found: {sorted(all_methods)}")
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: At least BANESCO and PAGO_MOVIL will appear. Possibly 5–15 different banks.
|
||||||
|
|
||||||
|
### Step 4: Verify advertiser diversity
|
||||||
|
|
||||||
|
```python
|
||||||
|
print(f"Unique advertisers: {df['advertiser_no'].nunique()}")
|
||||||
|
print(f"Merchants: {(df['advertiser_type'] == 'merchant').sum()}")
|
||||||
|
print(f"Users: {(df['advertiser_type'] == 'user').sum()}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 5: Run the collector for 1 hour (~12 snapshots) and verify:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ls data/raw/buy_ads/year=2026/month=06/day=05/ | wc -l
|
||||||
|
# Should be ~12
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] No duplicate timestamps
|
||||||
|
- [ ] No gaps > 6 minutes
|
||||||
|
- [ ] No crash/restart in the logs
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. File & Module Structure (Exact)
|
||||||
|
|
||||||
|
```
|
||||||
|
p2p-collector/
|
||||||
|
├── collect_p2p.py # Entry point: argument parsing, main loop
|
||||||
|
├── config.yaml # All configurable settings
|
||||||
|
├── binance_client.py # fetch_all_ads(), pagination, rate limiting
|
||||||
|
├── normalizer.py # normalize_ad(), flatten schema
|
||||||
|
├── storage.py # store_parquet(), atomic writes, checkpoint
|
||||||
|
├── validator.py # validate_row(), validate_snapshot()
|
||||||
|
├── scheduler.py # main loop, sleep/jitter, signal handling
|
||||||
|
├── alert.py # write_alert_file(), logging setup
|
||||||
|
├── utils.py # jitter(), datetime helpers
|
||||||
|
├── requirements.txt # pinned versions
|
||||||
|
├── Makefile # setup, run, clean, test commands
|
||||||
|
├── tests/
|
||||||
|
│ ├── test_normalizer.py # Test with sample API response
|
||||||
|
│ ├── test_storage.py # Test atomic writes
|
||||||
|
│ └── test_validator.py # Test rejection rules
|
||||||
|
├── sample_responses/
|
||||||
|
│ ├── response_buy.json # One real-ish API response for tests
|
||||||
|
│ └── response_sell.json
|
||||||
|
└── README.md # Run instructions
|
||||||
|
```
|
||||||
|
|
||||||
|
### requirements.txt
|
||||||
|
|
||||||
|
```
|
||||||
|
httpx>=0.27,<1.0
|
||||||
|
pandas>=2.0,<3.0
|
||||||
|
pyarrow>=14.0,<16.0
|
||||||
|
pyyaml>=6.0,<7.0
|
||||||
|
```
|
||||||
|
|
||||||
|
Note: `httpx` over `requests` because it has native timeout support, cleaner API. Fall back to `requests` if the coder prefers.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 10. `config.yaml` — Complete Reference
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
binance:
|
||||||
|
base_url: "https://p2p.binance.com/bapi/c2c/v2/friendly/c2c/adv/search"
|
||||||
|
user_agent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
||||||
|
timeout_seconds: 15
|
||||||
|
max_pages: 10
|
||||||
|
request_delay_seconds: 0.5
|
||||||
|
|
||||||
|
collection:
|
||||||
|
pairs:
|
||||||
|
- asset: "USDT"
|
||||||
|
fiat: "VES"
|
||||||
|
interval_seconds: 300
|
||||||
|
output_dir: "./data/raw"
|
||||||
|
retry_attempts: 3
|
||||||
|
retry_delay_base_seconds: 10
|
||||||
|
|
||||||
|
validation:
|
||||||
|
price_min: 1.0
|
||||||
|
price_max: 500.0
|
||||||
|
reject_zero_finish_rate: true
|
||||||
|
reject_zero_surplus: true
|
||||||
|
|
||||||
|
logging:
|
||||||
|
level: "INFO"
|
||||||
|
file: "./data/logs/collector.log"
|
||||||
|
max_bytes: 10485760 # 10 MB
|
||||||
|
backup_count: 5
|
||||||
|
format: "%(asctime)s | %(levelname)s | %(message)s"
|
||||||
|
|
||||||
|
alerts:
|
||||||
|
consecutive_failure_threshold: 5
|
||||||
|
alert_dir: "./data/alerts"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 11. Run Modes
|
||||||
|
|
||||||
|
### Mode 1: One-shot test
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python collect_p2p.py --once
|
||||||
|
```
|
||||||
|
|
||||||
|
- Fetches one BUY snapshot + one SELL snapshot
|
||||||
|
- Writes to disk
|
||||||
|
- Prints summary
|
||||||
|
- Exits
|
||||||
|
- Used for: first run, testing, debugging
|
||||||
|
|
||||||
|
### Mode 2: Daemon (continuous)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python collect_p2p.py
|
||||||
|
```
|
||||||
|
|
||||||
|
- Runs forever
|
||||||
|
- Loop with interval
|
||||||
|
- Graceful shutdown on Ctrl+C
|
||||||
|
|
||||||
|
### Mode 3: Backfill (future)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python collect_p2p.py --backfill --start=2026-06-01 --end=2026-06-03
|
||||||
|
```
|
||||||
|
|
||||||
|
- Not needed now
|
||||||
|
- Architecture supports it later
|
||||||
|
|
||||||
|
### Mode 4: Validate-only
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python collect_p2p.py --validate data/raw/buy_ads/year=2026/month=06/day=05/
|
||||||
|
```
|
||||||
|
|
||||||
|
- Reads Parquet files
|
||||||
|
- Runs validation checks
|
||||||
|
- Prints report
|
||||||
|
- No API calls
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 12. Testing the Coder's Work
|
||||||
|
|
||||||
|
Hand this checklist to the coder when they say "it's done":
|
||||||
|
|
||||||
|
| # | Test | How |
|
||||||
|
|---|---|---|
|
||||||
|
| 1 | **API connectivity** | `python collect_p2p.py --once` returns ads without error |
|
||||||
|
| 2 | **Pagination works** | Inspect: total ads fetched vs `total` field from API |
|
||||||
|
| 3 | **Both BUY and SELL** | Both directories have at least one file after `--once` |
|
||||||
|
| 4 | **Schema correct** | `pd.read_parquet(file)` → 23 columns, correct dtypes |
|
||||||
|
| 5 | **Payment methods populated** | At least 3 payment methods in the first snapshot |
|
||||||
|
| 6 | **Atomic write** | Kill the process mid-write (SIGKILL) → no partial `.parquet` files exist; at most a leftover `.tmp` file remains |
|
||||||
|
| 7 | **Graceful shutdown** | Ctrl+C during a snapshot → clean exit, last snapshot saved |
|
||||||
|
| 8 | **Restart resilience** | Start collector, kill it, restart → resumes without duplicate timestamps |
|
||||||
|
| 9 | **Rate limiting** | No HTTP 429 in logs after 1 hour of continuous running |
|
||||||
|
| 10 | **Storage efficiency** | 1 hour of data ≤ 3 MB total on disk |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 13. Post-Collection — What the Data Will Look Like After One Week
|
||||||
|
|
||||||
|
| Metric | Expected value |
|
||||||
|
|---|---|
|
||||||
|
| Snapshots collected | ~2,016 (7 days × 288 snapshots/day) |
|
||||||
|
| Total raw ads | ~200,000–400,000 rows |
|
||||||
|
| Storage used | ~20–100 MB |
|
||||||
|
| Unique advertisers | 100–500 |
|
||||||
|
| Unique payment methods | 10–20 |
|
||||||
|
| Price range (BUY) | ~55–65 VES/USDT (fluctuates with parallel dollar) |
|
||||||
|
| Price range (SELL) | ~58–70 VES/USDT |
|
||||||
|
| Typical spread | ~2–6 VES/USDT (3–10%) |
|
||||||
|
|
||||||
|
**After 1 week of collection, we stop and do EDA before any ML decisions.**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 14. Known Gotchas / FAQ for the Coder
|
||||||
|
|
||||||
|
**Q: What if the API returns different fields than documented?**
|
||||||
|
A: The normalizer should use `.get()` with defaults for every field. Log a warning if a field is missing that we expected. Don't crash.
|
||||||
|
|
||||||
|
**Q: What if `tradeMethods` is empty?**
|
||||||
|
A: Some ads have no payment methods listed. Store as empty list `[]`. Continue. This is valid data.
|
||||||
|
|
||||||
|
**Q: What timezone should I use?**
|
||||||
|
A: **Everything in UTC.** The user is in VET (UTC-4), but all stored timestamps are UTC. Timezone conversion is only for display.
|
||||||
|
|
||||||
|
**Q: What if the VPS reboots?**
|
||||||
|
A: systemd `Restart=always` handles this. The collector reads the last checkpoint and continues after the appropriate delay.
|
||||||
|
|
||||||
|
**Q: Should I use asyncio?**
|
||||||
|
A: No. Simple synchronous code. The delay between collection cycles (5 minutes) means async provides zero benefit and adds complexity.
|
||||||
|
|
||||||
|
**Q: Can I use SQLite instead of Parquet?**
|
||||||
|
A: You could, but Parquet is more storage-efficient and directly loadable into ML frameworks (Pandas, Polars, PyTorch). Stick with Parquet.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
*End of data collection spec. Hand this to the coding agent as the single source of truth.*
|
||||||
36
p2p-collector/Makefile
Normal file
36
p2p-collector/Makefile
Normal file
|
|
@ -0,0 +1,36 @@
|
||||||
|
.PHONY: setup test run run-once clean validate

VENV = .venv
PYTHON = $(VENV)/bin/python3
PIP = $(VENV)/bin/pip

# Create the virtual environment and install dependencies.
setup: $(VENV)/bin/activate

# Re-run only when requirements.txt is newer than the venv marker file;
# the final `touch` refreshes the marker so later `make setup` calls are no-ops.
$(VENV)/bin/activate: requirements.txt
	python3 -m venv $(VENV)
	$(PIP) install --upgrade pip
	$(PIP) install -r requirements.txt
	touch $(VENV)/bin/activate

# Run the unit test suite.
test: setup
	$(PYTHON) -m unittest discover -s tests -p "test_*.py"

# Run the collector.
run: setup
	$(PYTHON) collect_p2p.py

# Run the collector with --once.
run-once: setup
	$(PYTHON) collect_p2p.py --once

# Validate existing Parquet output: make validate PATH_TO_VALIDATE=<path>
validate: setup
	@if [ -z "$(PATH_TO_VALIDATE)" ]; then \
		echo "Usage: make validate PATH_TO_VALIDATE=<path>"; \
		exit 1; \
	fi
	# Quoted so that paths containing spaces or glob characters reach the script intact.
	$(PYTHON) collect_p2p.py --validate "$(PATH_TO_VALIDATE)"

# Remove the virtual environment and generated artefacts.
clean:
	rm -rf $(VENV)
	find . -type f -name "*.pyc" -delete
	find . -type d -name "__pycache__" -exec rm -rf {} +
	find . -type f -name "*.tmp" -delete
	@echo "Cleanup complete."
|
||||||
103
p2p-collector/README.md
Normal file
103
p2p-collector/README.md
Normal file
|
|
@ -0,0 +1,103 @@
|
||||||
|
# Binance P2P Data Collector
|
||||||
|
|
||||||
|
This tool continuously collects public peer-to-peer (P2P) market advertisements from Binance P2P for Venezuela (VES/USDT), normalizing, validating, and saving them as atomic date-partitioned Parquet files for subsequent exploratory data analysis and arbitrage modeling.
|
||||||
|
|
||||||
|
## Project Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
p2p-collector/
|
||||||
|
├── collect_p2p.py # Entry point: argument parsing, validation/daemon modes
|
||||||
|
├── config.yaml # Application configuration (endpoints, delays, validation limits)
|
||||||
|
├── binance_client.py # HTTP client, pagination logic, retry, and 429 backoff
|
||||||
|
├── normalizer.py # Converts raw nested API responses into a flat 23-column schema
|
||||||
|
├── validator.py # Row-level filtering and snapshot-level integrity checks
|
||||||
|
├── storage.py # Atomic Parquet writes, schema references, and checkpoints
|
||||||
|
├── scheduler.py # Loop executor, initial start offsets, signal handling
|
||||||
|
├── alert.py # Write alert marker files on 5 consecutive failures & logger setup
|
||||||
|
├── utils.py # Time and sleep/jitter helpers
|
||||||
|
├── requirements.txt # Package dependencies (httpx, pandas, pyarrow, pyyaml)
|
||||||
|
├── Makefile # Automation targets (setup, test, run, clean)
|
||||||
|
└── tests/ # Suite of unit tests for all components
|
||||||
|
```
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- **Python 3.8+** (Developed and tested with Python 3.14)
|
||||||
|
- **Make** (utility for running Makefile targets)
|
||||||
|
|
||||||
|
## Installation & Setup
|
||||||
|
|
||||||
|
Set up the Python virtual environment and install all dependencies:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make setup
|
||||||
|
```
|
||||||
|
|
||||||
|
## Running the Collector
|
||||||
|
|
||||||
|
### Mode 1: Continuous Daemon Mode
|
||||||
|
Runs indefinitely, fetching snapshots according to the configured interval (default: 5 minutes) with a ±10% sleep jitter to prevent pattern recognition. Handles graceful shutdown on SIGINT/SIGTERM.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make run
|
||||||
|
```
|
||||||
|
|
||||||
|
### Mode 2: One-shot Mode (Test/Debug)
|
||||||
|
Runs exactly one cycle (one BUY snapshot and one SELL snapshot), writes the results to disk, and exits immediately:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make run-once
|
||||||
|
```
|
||||||
|
|
||||||
|
### Mode 3: Validate-Only Mode
|
||||||
|
Validates existing Parquet files without making any network calls. It prints statistics (row count, min/max prices, payment methods) and checks for critical schema issues:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make validate PATH_TO_VALIDATE=data/raw/buy_ads/year=2026/month=06/day=05/
|
||||||
|
```
|
||||||
|
|
||||||
|
## Running Tests
|
||||||
|
|
||||||
|
Run the test suite to verify the client, normalizer, storage, and validation behaviors:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make test
|
||||||
|
```
|
||||||
|
|
||||||
|
## Output Directory Structure
|
||||||
|
|
||||||
|
Data is saved under the `./data/` folder inside the project root:
|
||||||
|
|
||||||
|
```
|
||||||
|
data/
|
||||||
|
├── raw/
|
||||||
|
│ ├── buy_ads/
|
||||||
|
│ │ └── year=YYYY/month=MM/day=DD/
|
||||||
|
│ │ ├── _schema.parquet # Empty schema reference
|
||||||
|
│ │ └── snapshot_YYYYMMDD_HHMMSS.parquet # Atomic snapshot data
|
||||||
|
│ └── sell_ads/
|
||||||
|
│ └── year=YYYY/month=MM/day=DD/
|
||||||
|
│ ├── _schema.parquet
|
||||||
|
│ └── snapshot_YYYYMMDD_HHMMSS.parquet
|
||||||
|
├── logs/
|
||||||
|
│ └── collector.log # Rotating logs
|
||||||
|
├── alerts/
|
||||||
|
│ └── YYYYMMDD_HHMMSS_5_failures.alert # Alert marker JSON file (only on failures)
|
||||||
|
└── checkpoint.json # Restart resilience marker
|
||||||
|
```
|
||||||
|
|
||||||
|
## Checkpoint Format
|
||||||
|
|
||||||
|
A checkpoint file is updated on every successful snapshot, ensuring that restarting the daemon will not query the API until the expected interval has passed:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"last_completed_snapshot": "2026-06-05T13:30:00Z",
|
||||||
|
"last_buy_ad_count": 47,
|
||||||
|
"last_sell_ad_count": 53,
|
||||||
|
"consecutive_failures": 0,
|
||||||
|
"total_snapshots": 284,
|
||||||
|
"first_snapshot": "2026-06-01T00:00:00Z",
|
||||||
|
"version": "1.0"
|
||||||
|
}
|
||||||
|
```
|
||||||
80
p2p-collector/alert.py
Normal file
80
p2p-collector/alert.py
Normal file
|
|
@ -0,0 +1,80 @@
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from logging.handlers import RotatingFileHandler
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def setup_logging(config: dict) -> None:
    """
    Configure the root logger from the ``logging`` section of *config*.

    A console handler is always installed. A rotating file handler is added
    on a best-effort basis: if the log directory or file cannot be set up,
    the failure is logged through the console handler and the function
    returns normally.

    Args:
        config: Application configuration. Keys read from its ``logging``
            section: ``level``, ``file``, ``max_bytes``, ``backup_count`` and
            ``format``; each falls back to a built-in default when absent.
    """
    # ``or {}`` also covers a section that is present but empty (YAML null).
    log_config = config.get("logging") or {}

    # str() tolerates a non-string level; anything that does not resolve to a
    # numeric logging level falls back to INFO.
    level = getattr(logging, str(log_config.get("level", "INFO")).upper(), logging.INFO)
    if not isinstance(level, int):
        level = logging.INFO

    log_file = log_config.get("file", "./data/logs/collector.log")
    max_bytes = log_config.get("max_bytes", 10485760)
    backup_count = log_config.get("backup_count", 5)
    log_format = log_config.get("format", "%(asctime)s | %(levelname)s | %(message)s")

    root_logger = logging.getLogger()
    root_logger.setLevel(level)

    # Remove and close existing handlers: avoids duplicate output on re-setup
    # and releases any log file a previous file handler still holds open.
    for handler in list(root_logger.handlers):
        root_logger.removeHandler(handler)
        handler.close()

    # Console handler is installed first so that a file-logging failure below
    # is still reported somewhere.
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(log_format))
    root_logger.addHandler(console_handler)

    # Rotating file handler (best effort). Directory creation is inside the
    # try block so an uncreatable directory degrades to console-only logging.
    try:
        log_dir = os.path.dirname(log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        file_handler = RotatingFileHandler(
            log_file, maxBytes=max_bytes, backupCount=backup_count, encoding="utf-8"
        )
        file_handler.setFormatter(logging.Formatter(log_format))
        root_logger.addHandler(file_handler)
        logger.info(f"Logging configured successfully. Writing to {log_file}")
    except Exception as e:
        logger.error(f"Failed to set up file logging: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def write_alert_file(config: dict, timestamp: datetime, error_msg: str, consecutive_failures: int, traceback_str: str) -> str:
    """
    Write an alert marker JSON file to the configured alert directory.

    The file is named ``<YYYYmmdd_HHMMSS>_<consecutive_failures>_failures.alert``
    and contains the timestamp, error message, failure count and traceback.

    Args:
        config: Application configuration; ``alerts.alert_dir`` selects the
            target directory (default ``./data/alerts``).
        timestamp: Moment of the failure; used for the file name and payload.
        error_msg: Error description stored under ``error``.
        consecutive_failures: Failure count stored in the payload and file name.
        traceback_str: Formatted traceback stored under ``traceback``.

    Returns:
        The path of the created alert file, or ``""`` if it could not be
        written (the failure is logged, never raised).
    """
    alerts_config = config.get("alerts") or {}
    alert_dir = alerts_config.get("alert_dir", "./data/alerts")

    time_str = timestamp.strftime("%Y%m%d_%H%M%S")
    filename = f"{time_str}_{consecutive_failures}_failures.alert"
    alert_path = os.path.join(alert_dir, filename)

    content = {
        "timestamp": timestamp.isoformat(),
        "error": error_msg,
        "consecutive_failures": consecutive_failures,
        "traceback": traceback_str,
    }

    try:
        # Directory creation is part of the guarded section so that an
        # uncreatable alert directory yields "" instead of an exception.
        os.makedirs(alert_dir, exist_ok=True)
        with open(alert_path, "w", encoding="utf-8") as f:
            json.dump(content, f, indent=4)
    except Exception as e:
        logger.critical(f"Failed to write alert file at {alert_path}: {e}")
        return ""

    logger.error(f"ALERT WRITTEN: {alert_path}")
    return alert_path
|
||||||
144
p2p-collector/binance_client.py
Normal file
144
p2p-collector/binance_client.py
Normal file
|
|
@ -0,0 +1,144 @@
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class BinanceP2PError(Exception):
    """Root of the exception hierarchy for Binance P2P client operations."""


class RateLimitError(BinanceP2PError):
    """Signals that the Binance P2P API answered with HTTP 429 (rate limited)."""


class APIError(BinanceP2PError):
    """Signals an API response with success=false or an invalid structure."""
|
||||||
|
|
||||||
|
class BinanceP2PClient:
    """
    Synchronous HTTP client for the Binance P2P advertisement search endpoint.

    Handles pagination, retries for transport and 5xx errors, and an
    exponential back-off (60s doubling up to 480s) after HTTP 429 responses.
    """

    # Rows requested per page; also used to decide when pagination is exhausted.
    ROWS_PER_PAGE = 20
    # Bounds, in seconds, of the back-off applied after an HTTP 429.
    INITIAL_429_BACKOFF = 60
    MAX_429_BACKOFF = 480

    def __init__(self, config: dict):
        """
        Read endpoint, header, pagination and retry settings from *config*.

        Args:
            config: Application configuration. The ``binance`` and
                ``collection`` sections are read; missing or empty sections
                and keys fall back to built-in defaults.
        """
        # ``or {}`` also covers a section that is present but empty (YAML null).
        binance_cfg = config.get("binance") or {}
        self.base_url = binance_cfg.get("base_url", "https://p2p.binance.com/bapi/c2c/v2/friendly/c2c/adv/search")
        self.user_agent = binance_cfg.get("user_agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
        self.timeout = binance_cfg.get("timeout_seconds", 15)
        self.max_pages = binance_cfg.get("max_pages", 10)
        self.page_delay = binance_cfg.get("request_delay_seconds", 0.5)

        self.headers = {
            "Content-Type": "application/json",
            "User-Agent": self.user_agent,
            "Accept": "*/*",
            "Origin": "https://p2p.binance.com",
            "Referer": "https://p2p.binance.com/",
        }

        # Retry config
        collection_cfg = config.get("collection") or {}
        self.max_retries = collection_cfg.get("retry_attempts", 3)
        self.retry_delay_base = collection_cfg.get("retry_delay_base_seconds", 10)

        # 429 rate-limiting state
        self.current_429_backoff = self.INITIAL_429_BACKOFF
        self.had_429_this_cycle = False

    def reset_429_backoff(self):
        """Reset the 429 backoff delay to its initial value (60s)."""
        if self.current_429_backoff != self.INITIAL_429_BACKOFF:
            logger.info("Resetting 429 rate limit backoff to 60s.")
            self.current_429_backoff = self.INITIAL_429_BACKOFF

    def double_429_backoff(self):
        """
        Sleep for the current 429 backoff delay, then double it (cap: 480s).

        Also marks ``had_429_this_cycle`` so callers can see that a rate
        limit was hit.
        """
        self.had_429_this_cycle = True
        delay = self.current_429_backoff
        # The message reports the delay actually slept now, not the next one.
        logger.warning(f"Rate limited (429). Backing off for {delay}s before continuing.")
        time.sleep(delay)
        self.current_429_backoff = min(delay * 2, self.MAX_429_BACKOFF)

    def _post_request_with_retries(self, body: dict) -> dict:
        """
        POST *body* to the search endpoint and return the decoded JSON payload.

        Transport errors and 5xx responses are retried up to ``max_retries``
        attempts (30s / 60s pauses). A 429 triggers the back-off sleep and is
        then raised without further retries.

        Raises:
            RateLimitError: The API answered with HTTP 429.
            APIError: The body is not valid JSON, is not a JSON object, or
                has a falsy ``success`` field.
            BinanceP2PError: Transport failure on the final attempt, a
                non-retryable HTTP error status, or retries exhausted.
        """
        for attempt in range(1, self.max_retries + 1):
            is_last_attempt = attempt >= self.max_retries

            try:
                # client.post() reads the whole body, so ``resp`` stays usable
                # after the client is closed.
                with httpx.Client(headers=self.headers, timeout=self.timeout) as client:
                    resp = client.post(self.base_url, json=body)
            except (httpx.TimeoutException, httpx.NetworkError, httpx.RemoteProtocolError) as e:
                if is_last_attempt:
                    logger.warning(f"Connection error occurred: {e} (attempt {attempt}/{self.max_retries}). No retries left.")
                    raise BinanceP2PError(f"Failed to connect after {self.max_retries} attempts: {e}") from e
                logger.warning(f"Connection error occurred: {e} (attempt {attempt}/{self.max_retries}). Retrying in 30s...")
                time.sleep(30)
                continue

            # Handle 429 specifically: back off, then fail this request.
            if resp.status_code == 429:
                self.double_429_backoff()
                raise RateLimitError("HTTP 429 Rate Limited by Binance.")

            # Handle 5xx server errors; do not sleep when no attempt is left.
            if 500 <= resp.status_code < 600:
                if is_last_attempt:
                    logger.warning(f"Binance P2P API returned HTTP {resp.status_code} (attempt {attempt}/{self.max_retries}). No retries left.")
                    break
                logger.warning(f"Binance P2P API returned HTTP {resp.status_code} (attempt {attempt}/{self.max_retries}). Retrying in 60s...")
                time.sleep(60)
                continue

            # Other HTTP errors (4xx except 429) are not retried.
            try:
                resp.raise_for_status()
            except httpx.HTTPStatusError as e:
                logger.error(f"HTTP Error {e.response.status_code}: {e.response.text}")
                raise BinanceP2PError(f"HTTP Status Error: {e}") from e

            # Parse JSON; decoding failures surface as ValueError subclasses.
            try:
                data = resp.json()
            except ValueError as e:
                raise APIError(f"API response is not valid JSON: {e}") from e

            if not isinstance(data, dict) or not data.get("success"):
                raise APIError(f"API response success=false: {data}")

            return data

        raise BinanceP2PError("Failed to fetch P2P ads after maximum retries.")

    def fetch_all_ads(self, trade_type: str, asset: str, fiat: str) -> list:
        """
        Fetch all P2P advertisements for a trade type, asset and fiat.

        Requests successive pages (at most ``max_pages``) with ``page_delay``
        seconds between them, stopping once the reported total has been
        collected or a page comes back empty.

        Returns:
            The raw advertisement entries of all fetched pages, in page order.

        Raises:
            BinanceP2PError: Propagated from the underlying request helper.
        """
        all_ads = []

        for page in range(1, self.max_pages + 1):
            body = {
                "asset": asset,
                "fiat": fiat,
                "tradeType": trade_type,
                "page": page,
                "rows": self.ROWS_PER_PAGE,
                "payTypes": [],
                "countries": [],
                "publisherType": None,
                "classify": "personal",
                "filter": {},
            }

            logger.info(f"Fetching {trade_type} page {page}/{self.max_pages} for {asset}/{fiat}...")
            data = self._post_request_with_retries(body)

            # Both fields may be present but null; treat that as "nothing".
            ads = data.get("data") or []
            total = data.get("total") or 0

            all_ads.extend(ads)
            logger.info(f"Retrieved {len(ads)} ads. Total collected so far: {len(all_ads)}/{total}")

            # An empty page means there is nothing further to collect.
            if not ads:
                break

            # Stop if we've collected all available ads
            if len(all_ads) >= total:
                break

            # Don't request a page that starts beyond total ads
            if page * self.ROWS_PER_PAGE >= total:
                break

            # Delay between pages
            if page < self.max_pages:
                time.sleep(self.page_delay)

        return all_ads
|
||||||
176
p2p-collector/collect_p2p.py
Normal file
176
p2p-collector/collect_p2p.py
Normal file
|
|
@ -0,0 +1,176 @@
|
||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
import logging
|
||||||
|
import yaml
|
||||||
|
from pathlib import Path
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from alert import setup_logging
|
||||||
|
from scheduler import P2PCollectorScheduler
|
||||||
|
|
||||||
|
logger = logging.getLogger("collect_p2p")
|
||||||
|
|
||||||
|
def load_config(config_path: str) -> dict:
    """
    Load the YAML configuration file and return it as a dictionary.

    An empty file yields ``{}`` so that callers can rely on ``.get()`` with
    their built-in defaults. Any failure is reported on stderr and terminates
    the process with exit status 1.

    Args:
        config_path: Path of the YAML file to read.

    Returns:
        The parsed top-level mapping (``{}`` for an empty document).
    """
    try:
        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)
    except FileNotFoundError:
        print(f"Error: Config file not found at {config_path}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error parsing config file {config_path}: {e}", file=sys.stderr)
        sys.exit(1)

    # safe_load() returns None for an empty document.
    if config is None:
        return {}

    # Consumers call .get() on the result, so anything but a mapping is unusable.
    if not isinstance(config, dict):
        print(
            f"Error parsing config file {config_path}: "
            f"top-level value must be a mapping, got {type(config).__name__}",
            file=sys.stderr,
        )
        sys.exit(1)

    return config
|
||||||
|
|
||||||
|
|
||||||
|
def validate_parquet_files(path_str: str):
    """
    Validate existing Parquet files at *path_str* (a file or a directory).

    Prints a per-file report (row count, trade types, price range, payment
    methods) followed by a summary of warnings and critical errors.

    Args:
        path_str: Path of a ``.parquet`` file, or of a directory that is
            searched recursively for ``*.parquet`` files.

    Exit behaviour:
        * exits with status 1 if the path does not exist or any critical
          error was found;
        * exits with status 0 if validation ran and found no critical error;
        * returns ``None`` (without exiting) if no Parquet file was found.
    """
    path = Path(path_str)
    if not path.exists():
        print(f"Error: Path does not exist: {path_str}", file=sys.stderr)
        sys.exit(1)

    if path.is_file():
        files = [path] if path.suffix == ".parquet" else []
    elif path.is_dir():
        files = sorted(path.glob("**/*.parquet"))
    else:
        files = []

    if not files:
        print(f"No Parquet files found at {path_str}")
        return

    # Skip schema reference files unless one is the single, explicit target.
    # Counting only the remaining files keeps the reported totals accurate.
    if len(files) > 1:
        targets = [f for f in files if f.name != "_schema.parquet"]
    else:
        targets = files

    print(f"Validating {len(targets)} Parquet file(s)...")
    print("-" * 60)

    total_rows = 0
    total_buy = 0
    total_sell = 0
    critical_errors = []
    warnings = []

    expected_columns = {
        "snapshot_id", "fetched_at", "fetched_date", "trade_type", "adv_no",
        "asset", "fiat", "price", "surplus_amount", "min_amount", "max_amount",
        "tradable_quantity", "advertiser_no", "advertiser_name", "advertiser_type",
        "month_order_count", "month_finish_rate", "positive_rate", "user_positive_rate",
        "payment_methods", "payment_method_ids", "ad_created_at", "price_type",
    }
    critical_fields = ["price", "adv_no", "advertiser_no"]

    for f in targets:
        # Any failure while reading or inspecting one file is recorded as a
        # critical error for that file; the remaining files are still checked.
        try:
            df = pd.read_parquet(f)
            rows = len(df)
            total_rows += rows

            print(f"File: {f.name} ({rows} rows)")

            # Check columns
            cols = set(df.columns)
            missing_cols = expected_columns - cols
            extra_cols = cols - expected_columns
            if missing_cols:
                critical_errors.append(f"{f.name}: Missing expected columns: {missing_cols}")
            if extra_cols:
                warnings.append(f"{f.name}: Has extra columns: {extra_cols}")

            # Analyze trade types
            if "trade_type" in df.columns:
                buy_cnt = (df["trade_type"] == "BUY").sum()
                sell_cnt = (df["trade_type"] == "SELL").sum()
                total_buy += buy_cnt
                total_sell += sell_cnt
                print(f"  Trade types: BUY={buy_cnt}, SELL={sell_cnt}")

            # Check critical nulls
            for col in critical_fields:
                if col in df.columns:
                    null_cnt = df[col].isnull().sum()
                    if null_cnt > 0:
                        critical_errors.append(f"{f.name}: Column '{col}' has {null_cnt} null values")

            # Check types
            if "price" in df.columns and not pd.api.types.is_float_dtype(df["price"]):
                critical_errors.append(f"{f.name}: Column 'price' is not float type")

            # Prices summary
            if "price" in df.columns and rows > 0:
                print(f"  Price range: [{df['price'].min():.2f} - {df['price'].max():.2f}]")

            # Payment methods: each cell may be a single string or an
            # iterable (list/array) of method names.
            if "payment_methods" in df.columns and rows > 0:
                methods = set()
                for item in df["payment_methods"]:
                    if isinstance(item, str):
                        methods.add(item)
                    elif hasattr(item, "__iter__"):
                        methods.update(item)
                print(f"  Payment methods ({len(methods)}): {sorted(methods)}")

        except Exception as e:
            critical_errors.append(f"{f.name}: Failed to read/validate: {e}")

    print("=" * 60)
    print("VALIDATION SUMMARY")
    print("=" * 60)
    print(f"Total files validated: {len(targets)}")
    print(f"Total rows: {total_rows}")
    print(f"Total BUY ads: {total_buy}")
    print(f"Total SELL ads: {total_sell}")

    print("\nWarnings:")
    if warnings:
        for w in warnings:
            print(f"  - {w}")
    else:
        print("  None")

    print("\nCritical Errors:")
    if critical_errors:
        for err in critical_errors:
            print(f"  - [FAIL] {err}")
        sys.exit(1)
    else:
        print("  [PASS] No critical validation issues found!")
        sys.exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Command-line entry point.

    With ``--validate PATH`` only the Parquet validation report is produced.
    Otherwise the configuration is loaded, logging is set up, and the
    scheduler is run (a single cycle when ``--once`` is given).
    """
    cli = argparse.ArgumentParser(description="Binance P2P Data Collector")
    cli.add_argument("--config", default="config.yaml", help="Path to config.yaml file")
    cli.add_argument("--once", action="store_true", help="Run a single collection cycle and exit")
    cli.add_argument("--validate", help="Path to Parquet file or directory to validate")
    options = cli.parse_args()

    # Validate-only mode: no configuration, logging or scheduler needed.
    if options.validate:
        validate_parquet_files(options.validate)
        return

    settings = load_config(options.config)
    setup_logging(settings)

    collector = P2PCollectorScheduler(settings)
    collector.run(once=options.once)


if __name__ == "__main__":
    main()
|
||||||
32
p2p-collector/config.yaml
Normal file
32
p2p-collector/config.yaml
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
binance:
|
||||||
|
base_url: "https://p2p.binance.com/bapi/c2c/v2/friendly/c2c/adv/search"
|
||||||
|
user_agent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
||||||
|
timeout_seconds: 15
|
||||||
|
max_pages: 10
|
||||||
|
request_delay_seconds: 0.5
|
||||||
|
|
||||||
|
collection:
|
||||||
|
pairs:
|
||||||
|
- asset: "USDT"
|
||||||
|
fiat: "VES"
|
||||||
|
interval_seconds: 300
|
||||||
|
output_dir: "./data/raw"
|
||||||
|
retry_attempts: 3
|
||||||
|
retry_delay_base_seconds: 10
|
||||||
|
|
||||||
|
validation:
|
||||||
|
price_min: 1.0
|
||||||
|
price_max: 2000.0
|
||||||
|
reject_zero_finish_rate: true
|
||||||
|
reject_zero_surplus: true
|
||||||
|
|
||||||
|
logging:
|
||||||
|
level: "INFO"
|
||||||
|
file: "./data/logs/collector.log"
|
||||||
|
max_bytes: 10485760 # 10 MB
|
||||||
|
backup_count: 5
|
||||||
|
format: "%(asctime)s | %(levelname)s | %(message)s"
|
||||||
|
|
||||||
|
alerts:
|
||||||
|
consecutive_failure_threshold: 5
|
||||||
|
alert_dir: "./data/alerts"
|
||||||
99
p2p-collector/normalizer.py
Normal file
99
p2p-collector/normalizer.py
Normal file
|
|
@ -0,0 +1,99 @@
|
||||||
|
import logging
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def normalize_ad(raw_ad: dict, trade_type: str, fetched_at: datetime) -> dict:
    """
    Flatten one raw Binance P2P ad into the 23-column snapshot row schema.

    Missing or malformed fields never raise: they are replaced by defaults
    and, where a value was expected, a warning is logged.

    Args:
        raw_ad: One entry of the API ``data`` list, expected to hold the
            nested ``adv`` and ``advertiser`` objects.
        trade_type: Trade side stored in the row and in ``snapshot_id``.
        fetched_at: Snapshot timestamp. A timezone-aware value is converted
            to UTC because ``snapshot_id`` carries a literal ``Z`` suffix; a
            naive value is used as given.

    Returns:
        A flat dictionary with the normalized columns.
    """
    # Timezone-aware timestamps are normalized so the "Z" in snapshot_id and
    # the derived fetched_date really describe UTC.
    if fetched_at.utcoffset() is not None:
        fetched_at = fetched_at.astimezone(timezone.utc)

    # A key that is present but null (or not an object) is treated as missing.
    adv = raw_ad.get("adv")
    if not isinstance(adv, dict):
        adv = {}
    adver = raw_ad.get("advertiser")
    if not isinstance(adver, dict):
        adver = {}

    # Check if critical structures are missing
    if not adv:
        logger.warning("Ad structure 'adv' is missing in raw ad data.")
    if not adver:
        logger.warning("Advertiser structure 'advertiser' is missing in raw ad data.")

    # Extract payment methods; an absent/empty list is valid data.
    trade_methods = adv.get("tradeMethods") or []
    if not isinstance(trade_methods, (list, tuple)):
        trade_methods = []
    payment_methods = []
    payment_method_ids = []
    for m in trade_methods:
        if isinstance(m, dict):
            pay_type = m.get("payType")
            identifier = m.get("identifier")
            if pay_type:
                payment_methods.append(pay_type)
            if identifier:
                payment_method_ids.append(identifier)

    # Safe float conversion helper
    def safe_float(val, default=0.0, field_name=None):
        if val is None:
            return default
        try:
            return float(val)
        except (ValueError, TypeError, OverflowError) as e:
            if field_name:
                logger.warning(f"Could not convert field '{field_name}' value {val!r} to float: {e}")
            return default

    # Safe int conversion helper (OverflowError: int() of an infinite float)
    def safe_int(val, default=0, field_name=None):
        if val is None:
            return default
        try:
            return int(val)
        except (ValueError, TypeError, OverflowError) as e:
            if field_name:
                logger.warning(f"Could not convert field '{field_name}' value {val!r} to int: {e}")
            return default

    # Convert createTime (milliseconds since epoch) to datetime. An absent or
    # unparsable value falls back to fetched_at; float() is called directly so
    # a bad value is not silently turned into 0 (i.e. 1970-01-01).
    create_time_ms = adv.get("createTime")
    ad_created_at = fetched_at
    if create_time_ms is not None:
        try:
            ad_created_at = datetime.fromtimestamp(float(create_time_ms) / 1000.0, tz=timezone.utc)
        except Exception as e:
            logger.warning(f"Could not parse ad createTime {create_time_ms}: {e}")

    # Check for missing expected fields to log warnings, but don't fail
    for key in ("advNo", "asset", "fiatUnit", "price"):
        if key not in adv:
            logger.warning(f"Expected key '{key}' not found in 'adv' structure of ad: {raw_ad}")

    if "userNo" not in adver:
        logger.warning(f"Expected key 'userNo' not found in 'advertiser' structure of ad: {raw_ad}")

    return {
        "snapshot_id": f"{fetched_at.strftime('%Y%m%dT%H%M%SZ')}_{trade_type}",
        "fetched_at": fetched_at,
        "fetched_date": fetched_at.strftime("%Y-%m-%d"),
        "trade_type": trade_type,
        "adv_no": adv.get("advNo", ""),
        "asset": adv.get("asset", "USDT"),
        "fiat": adv.get("fiatUnit", "VES"),
        "price": safe_float(adv.get("price"), 0.0, "price"),
        "surplus_amount": safe_float(adv.get("surplusAmount"), 0.0, "surplusAmount"),
        "min_amount": safe_float(adv.get("minSingleTransAmount"), 0.0, "minSingleTransAmount"),
        "max_amount": safe_float(adv.get("maxSingleTransAmount"), 0.0, "maxSingleTransAmount"),
        "tradable_quantity": safe_float(adv.get("tradableQuantity"), 0.0, "tradableQuantity"),
        "advertiser_no": adver.get("userNo", ""),
        "advertiser_name": adver.get("nickName", ""),
        "advertiser_type": adver.get("userType", "user"),
        "month_order_count": safe_int(adver.get("monthOrderCount"), 0, "monthOrderCount"),
        "month_finish_rate": safe_float(adver.get("monthFinishRate"), 0.0, "monthFinishRate"),
        "positive_rate": safe_float(adver.get("positiveRate"), 0.0, "positiveRate"),
        "user_positive_rate": safe_float(adver.get("userPositiveRate"), 0.0, "userPositiveRate"),
        "payment_methods": payment_methods,
        "payment_method_ids": payment_method_ids,
        "ad_created_at": ad_created_at,
        "price_type": adv.get("priceType", "FIXED"),
    }
|
||||||
4
p2p-collector/requirements.txt
Normal file
4
p2p-collector/requirements.txt
Normal file
|
|
@ -0,0 +1,4 @@
|
||||||
|
httpx>=0.27,<1.0
|
||||||
|
pandas>=2.0,<3.0
|
||||||
|
pyarrow>=14.0
|
||||||
|
pyyaml>=6.0,<7.0
|
||||||
44
p2p-collector/sample_responses/response_buy.json
Normal file
44
p2p-collector/sample_responses/response_buy.json
Normal file
|
|
@ -0,0 +1,44 @@
|
||||||
|
{
|
||||||
|
"data": [
|
||||||
|
{
|
||||||
|
"adv": {
|
||||||
|
"advNo": "6f8b2e12345",
|
||||||
|
"tradeType": "BUY",
|
||||||
|
"asset": "USDT",
|
||||||
|
"fiatUnit": "VES",
|
||||||
|
"price": "58.50",
|
||||||
|
"surplusAmount": "1520.43",
|
||||||
|
"maxSingleTransAmount": "5000.00",
|
||||||
|
"minSingleTransAmount": "100.00",
|
||||||
|
"tradableQuantity": "1520.43",
|
||||||
|
"createTime": 1749128400000,
|
||||||
|
"fiatSymbol": "Bs",
|
||||||
|
"priceType": "FIXED",
|
||||||
|
"tradeMethods": [
|
||||||
|
{
|
||||||
|
"identifier": "Banco_Banesco",
|
||||||
|
"payType": "BANESCO",
|
||||||
|
"payMethodId": "BANESCO"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"identifier": "Pago_Movil",
|
||||||
|
"payType": "PAGO_MOVIL",
|
||||||
|
"payMethodId": "PAGO_MOVIL"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"advertiser": {
|
||||||
|
"userNo": "ABC123456",
|
||||||
|
"nickName": "CryptoTraderVE",
|
||||||
|
"userType": "merchant",
|
||||||
|
"monthOrderCount": 342,
|
||||||
|
"monthFinishRate": 0.97,
|
||||||
|
"positiveRate": 0.99,
|
||||||
|
"userPositiveRate": 0.99
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"total": 1,
|
||||||
|
"pageSize": 20,
|
||||||
|
"success": true
|
||||||
|
}
|
||||||
44
p2p-collector/sample_responses/response_sell.json
Normal file
44
p2p-collector/sample_responses/response_sell.json
Normal file
|
|
@ -0,0 +1,44 @@
|
||||||
|
{
|
||||||
|
"data": [
|
||||||
|
{
|
||||||
|
"adv": {
|
||||||
|
"advNo": "7a9c3d98765",
|
||||||
|
"tradeType": "SELL",
|
||||||
|
"asset": "USDT",
|
||||||
|
"fiatUnit": "VES",
|
||||||
|
"price": "62.30",
|
||||||
|
"surplusAmount": "2500.00",
|
||||||
|
"maxSingleTransAmount": "10000.00",
|
||||||
|
"minSingleTransAmount": "500.00",
|
||||||
|
"tradableQuantity": "2500.00",
|
||||||
|
"createTime": 1749129000000,
|
||||||
|
"fiatSymbol": "Bs",
|
||||||
|
"priceType": "FIXED",
|
||||||
|
"tradeMethods": [
|
||||||
|
{
|
||||||
|
"identifier": "Banco_Mercantil",
|
||||||
|
"payType": "MERCANTIL",
|
||||||
|
"payMethodId": "MERCANTIL"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"identifier": "Pago_Movil",
|
||||||
|
"payType": "PAGO_MOVIL",
|
||||||
|
"payMethodId": "PAGO_MOVIL"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"advertiser": {
|
||||||
|
"userNo": "XYZ789012",
|
||||||
|
"nickName": "MercantilSeller",
|
||||||
|
"userType": "merchant",
|
||||||
|
"monthOrderCount": 512,
|
||||||
|
"monthFinishRate": 0.99,
|
||||||
|
"positiveRate": 0.98,
|
||||||
|
"userPositiveRate": 0.98
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"total": 1,
|
||||||
|
"pageSize": 20,
|
||||||
|
"success": true
|
||||||
|
}
|
||||||
240
p2p-collector/scheduler.py
Normal file
240
p2p-collector/scheduler.py
Normal file
|
|
@ -0,0 +1,240 @@
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import signal
|
||||||
|
import logging
|
||||||
|
import traceback
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from binance_client import BinanceP2PClient, BinanceP2PError
|
||||||
|
from normalizer import normalize_ad
|
||||||
|
from validator import validate_row, validate_snapshot
|
||||||
|
from storage import store_parquet, read_checkpoint, write_checkpoint
|
||||||
|
from alert import write_alert_file
|
||||||
|
from utils import jitter, now_utc
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class P2PCollectorScheduler:
|
||||||
|
def __init__(self, config: dict):
    """
    Set up scheduler state from *config* and install signal handlers.

    Args:
        config: Application configuration. The ``collection`` section
            supplies ``output_dir``, ``interval_seconds`` and ``pairs``;
            missing or empty values fall back to built-in defaults. The
            whole dictionary is also passed to ``BinanceP2PClient``.
    """
    self.config = config
    self.running = True

    # Configure directories. ``or`` also covers values that are present
    # but empty (YAML null), which ``.get(key, default)`` would let through.
    collection_cfg = config.get("collection") or {}
    self.output_dir = collection_cfg.get("output_dir") or "./data/raw"
    self.interval = collection_cfg.get("interval_seconds", 300)
    self.pairs = collection_cfg.get("pairs") or [{"asset": "USDT", "fiat": "VES"}]

    # Checkpoint path: checkpoint.json lives next to the raw output folder,
    # i.e. in the parent directory of output_dir:
    #   data/
    #   ├── raw/
    #   └── checkpoint.json
    self.data_dir = str(Path(self.output_dir).parent)
    self.checkpoint_path = os.path.join(self.data_dir, "checkpoint.json")

    self.client = BinanceP2PClient(config)
    self.consecutive_failures = 0

    # Signal handlers: SIGINT/SIGTERM request a graceful stop.
    signal.signal(signal.SIGINT, self._handle_signal)
    signal.signal(signal.SIGTERM, self._handle_signal)
|
||||||
|
|
||||||
|
def _handle_signal(self, sig, frame):
|
||||||
|
logger.info(f"Received signal {sig}. Initiating graceful shutdown after current snapshot completes...")
|
||||||
|
self.running = False
|
||||||
|
|
||||||
|
def verify_directories(self):
|
||||||
|
"""Verifies that the output directory is writable by writing a test file and deleting it."""
|
||||||
|
os.makedirs(self.output_dir, exist_ok=True)
|
||||||
|
test_file = os.path.join(self.output_dir, ".write_test")
|
||||||
|
try:
|
||||||
|
with open(test_file, "w") as f:
|
||||||
|
f.write("test")
|
||||||
|
os.remove(test_file)
|
||||||
|
logger.info(f"Directory write verification passed for {self.output_dir}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.critical(f"Directory verification failed on {self.output_dir}: {e}")
|
||||||
|
raise OSError(f"Output directory not writable: {e}")
|
||||||
|
|
||||||
|
def get_initial_wait_seconds(self) -> float:
|
||||||
|
"""Reads checkpoint to determine how long to wait before starting the loop."""
|
||||||
|
checkpoint = read_checkpoint(self.checkpoint_path)
|
||||||
|
last_completed = checkpoint.get("last_completed_snapshot")
|
||||||
|
if not last_completed:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
try:
|
||||||
|
last_dt = datetime.fromisoformat(last_completed)
|
||||||
|
# Make sure timezone aware UTC
|
||||||
|
if last_dt.tzinfo is None:
|
||||||
|
last_dt = last_dt.replace(tzinfo=timezone.utc)
|
||||||
|
|
||||||
|
elapsed = (now_utc() - last_dt).total_seconds()
|
||||||
|
wait_time = self.interval - elapsed
|
||||||
|
if wait_time > 0:
|
||||||
|
logger.info(f"Resuming. Last snapshot completed {elapsed:.1f}s ago. Initial wait time: {wait_time:.1f}s.")
|
||||||
|
return wait_time
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Error parsing last snapshot time from checkpoint: {e}. Starting immediately.")
|
||||||
|
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
def run_single_cycle(self) -> dict:
|
||||||
|
"""Runs a single snapshot collection cycle for all pairs."""
|
||||||
|
cycle_start_time = now_utc()
|
||||||
|
cycle_stats = {}
|
||||||
|
|
||||||
|
# We process each pair configured
|
||||||
|
for pair in self.pairs:
|
||||||
|
asset = pair.get("asset", "USDT")
|
||||||
|
fiat = pair.get("fiat", "VES")
|
||||||
|
|
||||||
|
start_ts = time.time()
|
||||||
|
|
||||||
|
# 1. Fetch raw advertisements
|
||||||
|
# Delay between trade types (snapshots) is 1 second
|
||||||
|
buy_raw = self.client.fetch_all_ads("BUY", asset, fiat)
|
||||||
|
time.sleep(1.0)
|
||||||
|
sell_raw = self.client.fetch_all_ads("SELL", asset, fiat)
|
||||||
|
|
||||||
|
# 2. Normalize and Filter individual rows
|
||||||
|
seen_adv_nos = set()
|
||||||
|
flat_buy = []
|
||||||
|
for ad in buy_raw:
|
||||||
|
norm = normalize_ad(ad, "BUY", cycle_start_time)
|
||||||
|
if validate_row(norm, self.config, seen_adv_nos):
|
||||||
|
flat_buy.append(norm)
|
||||||
|
|
||||||
|
flat_sell = []
|
||||||
|
for ad in sell_raw:
|
||||||
|
norm = normalize_ad(ad, "SELL", cycle_start_time)
|
||||||
|
if validate_row(norm, self.config, seen_adv_nos):
|
||||||
|
flat_sell.append(norm)
|
||||||
|
|
||||||
|
# 3. Snapshot-level Validation
|
||||||
|
combined_ads = flat_buy + flat_sell
|
||||||
|
validation_summary = validate_snapshot(combined_ads, cycle_start_time)
|
||||||
|
|
||||||
|
# 4. Storage (atomic writes to raw/buy_ads and raw/sell_ads)
|
||||||
|
buy_path = store_parquet(flat_buy, os.path.join(self.output_dir, "buy_ads"), cycle_start_time)
|
||||||
|
sell_path = store_parquet(flat_sell, os.path.join(self.output_dir, "sell_ads"), cycle_start_time)
|
||||||
|
|
||||||
|
elapsed = time.time() - start_ts
|
||||||
|
|
||||||
|
# 5. Snapshot Summary Log Line
|
||||||
|
# Format: 2026-06-05 13:30:00 UTC | BUY=47 ads [54.20–62.80] SELL=53 ads [58.00–68.50] | spread= -4.80 | took 3.2s | methods=[BANESCO,PAGO_MOVIL,MERCANTIL,...]
|
||||||
|
methods_str = ",".join(validation_summary["methods"])
|
||||||
|
logger.info(
|
||||||
|
f"{cycle_start_time.strftime('%Y-%m-%d %H:%M:%S')} UTC | "
|
||||||
|
f"BUY={validation_summary['buy_count']} ads [{validation_summary['buy_min']:.2f}-{validation_summary['buy_max']:.2f}] "
|
||||||
|
f"SELL={validation_summary['sell_count']} ads [{validation_summary['sell_min']:.2f}-{validation_summary['sell_max']:.2f}] | "
|
||||||
|
f"spread={validation_summary['spread']:.2f} | took {elapsed:.1f}s | "
|
||||||
|
f"methods=[{methods_str}]"
|
||||||
|
)
|
||||||
|
|
||||||
|
cycle_stats[f"{asset}_{fiat}"] = {
|
||||||
|
"buy_count": validation_summary['buy_count'],
|
||||||
|
"sell_count": validation_summary['sell_count'],
|
||||||
|
"timestamp": cycle_start_time.isoformat()
|
||||||
|
}
|
||||||
|
|
||||||
|
return cycle_stats
|
||||||
|
|
||||||
|
def run(self, once: bool = False):
|
||||||
|
"""Starts the main execution loop."""
|
||||||
|
self.verify_directories()
|
||||||
|
|
||||||
|
if once:
|
||||||
|
logger.info("Executing a single collection cycle (--once)...")
|
||||||
|
try:
|
||||||
|
self.run_single_cycle()
|
||||||
|
logger.info("One-shot collection complete. Exiting.")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error occurred during one-shot collection: {e}")
|
||||||
|
logger.error(traceback.format_exc())
|
||||||
|
raise e
|
||||||
|
return
|
||||||
|
|
||||||
|
# Continuous loop startup
|
||||||
|
initial_wait = self.get_initial_wait_seconds()
|
||||||
|
if initial_wait > 0 and self.running:
|
||||||
|
logger.info(f"Sleeping for initial delay of {initial_wait:.1f}s...")
|
||||||
|
# Sleep in small steps to remain responsive to signals
|
||||||
|
step = 1.0
|
||||||
|
while initial_wait > 0 and self.running:
|
||||||
|
time.sleep(min(step, initial_wait))
|
||||||
|
initial_wait -= step
|
||||||
|
|
||||||
|
logger.info(f"Starting P2P data collector. Interval: {self.interval}s. Pairs: {self.pairs}")
|
||||||
|
|
||||||
|
while self.running:
|
||||||
|
cycle_start = now_utc()
|
||||||
|
try:
|
||||||
|
stats = self.run_single_cycle()
|
||||||
|
|
||||||
|
# Reset failure stats on success
|
||||||
|
self.consecutive_failures = 0
|
||||||
|
self.client.reset_429_backoff()
|
||||||
|
|
||||||
|
# Update checkpoint
|
||||||
|
checkpoint = read_checkpoint(self.checkpoint_path)
|
||||||
|
|
||||||
|
# Get stats for the primary pair (USDT_VES or first pair)
|
||||||
|
primary_pair_key = f"{self.pairs[0]['asset']}_{self.pairs[0]['fiat']}"
|
||||||
|
pair_stats = stats.get(primary_pair_key, {})
|
||||||
|
|
||||||
|
# Update stats in checkpoint
|
||||||
|
total_snapshots = checkpoint.get("total_snapshots", 0) + 1
|
||||||
|
first_snapshot = checkpoint.get("first_snapshot", cycle_start.isoformat())
|
||||||
|
|
||||||
|
checkpoint_data = {
|
||||||
|
"last_completed_snapshot": cycle_start.isoformat(),
|
||||||
|
"last_buy_ad_count": pair_stats.get("buy_count", 0),
|
||||||
|
"last_sell_ad_count": pair_stats.get("sell_count", 0),
|
||||||
|
"consecutive_failures": 0,
|
||||||
|
"total_snapshots": total_snapshots,
|
||||||
|
"first_snapshot": first_snapshot,
|
||||||
|
"version": "1.0"
|
||||||
|
}
|
||||||
|
write_checkpoint(self.checkpoint_path, checkpoint_data)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.consecutive_failures += 1
|
||||||
|
tb_str = traceback.format_exc()
|
||||||
|
logger.error(f"Error during collection cycle (consecutive failures: {self.consecutive_failures}): {e}")
|
||||||
|
logger.error(tb_str)
|
||||||
|
|
||||||
|
# Write alert file after threshold
|
||||||
|
alert_threshold = self.config.get("alerts", {}).get("consecutive_failure_threshold", 5)
|
||||||
|
if self.consecutive_failures >= alert_threshold:
|
||||||
|
try:
|
||||||
|
write_alert_file(
|
||||||
|
self.config,
|
||||||
|
cycle_start,
|
||||||
|
str(e),
|
||||||
|
self.consecutive_failures,
|
||||||
|
tb_str
|
||||||
|
)
|
||||||
|
except Exception as alert_error:
|
||||||
|
logger.critical(f"Failed to write alert file: {alert_error}")
|
||||||
|
|
||||||
|
# Update checkpoint with failure status
|
||||||
|
checkpoint = read_checkpoint(self.checkpoint_path)
|
||||||
|
checkpoint["consecutive_failures"] = self.consecutive_failures
|
||||||
|
write_checkpoint(self.checkpoint_path, checkpoint)
|
||||||
|
|
||||||
|
# Determine sleep duration
|
||||||
|
if self.running:
|
||||||
|
sleep_sec = jitter(self.interval)
|
||||||
|
logger.info(f"Sleeping for {sleep_sec:.1f}s before next cycle...")
|
||||||
|
|
||||||
|
# Sleep in small steps to handle signal termination cleanly
|
||||||
|
step = 1.0
|
||||||
|
while sleep_sec > 0 and self.running:
|
||||||
|
time.sleep(min(step, sleep_sec))
|
||||||
|
sleep_sec -= step
|
||||||
|
|
||||||
|
logger.info("Collector has shut down gracefully.")
|
||||||
102
p2p-collector/storage.py
Normal file
102
p2p-collector/storage.py
Normal file
|
|
@ -0,0 +1,102 @@
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def store_parquet(rows: list, base_dir: str, fetched_at: datetime) -> str:
    """
    Store rows of ads in a date-partitioned Parquet file using an atomic write.

    The file is written to
    ``<base_dir>/year=YYYY/month=MM/day=DD/snapshot_YYYYMMDD_HHMMSS.parquet``,
    with all path components derived from ``fetched_at``. An empty
    ``_schema.parquet`` with the same columns is also created in that
    directory if it does not exist yet.

    Args:
        rows: List of row dicts; columns are written in sorted order.
        base_dir: Root directory of the partitioned dataset.
        fetched_at: Snapshot timestamp used for the partition and file name.

    Returns:
        The path of the written Parquet file, or ``""`` when ``rows`` is empty.

    Raises:
        Exception: Any error raised while building or writing the file is
            logged and re-raised after the temporary file is cleaned up.
    """
    if not rows:
        logger.warning("No rows provided to store_parquet.")
        return ""

    year = fetched_at.strftime("%Y")
    month = fetched_at.strftime("%m")
    day = fetched_at.strftime("%d")
    filename = f"snapshot_{fetched_at.strftime('%Y%m%d_%H%M%S')}.parquet"

    dest_dir = Path(base_dir) / f"year={year}" / f"month={month}" / f"day={day}"
    dest_dir.mkdir(parents=True, exist_ok=True)

    tmp_path = dest_dir / (filename + ".tmp")
    final_path = dest_dir / filename

    try:
        df = pd.DataFrame(rows)
        # Sort columns to ensure consistent schema layout
        df = df.reindex(sorted(df.columns), axis=1)

        # Write to a temporary file, then move it into place. Path.replace
        # overwrites an existing target on every platform, whereas
        # Path.rename raises on Windows when the target already exists.
        df.to_parquet(tmp_path, index=False, engine="pyarrow")
        tmp_path.replace(final_path)
    except Exception as e:
        logger.error(f"Failed to write parquet file {final_path}: {e}")
        # Clean up tmp file if it exists
        try:
            tmp_path.unlink()
        except FileNotFoundError:
            pass
        except Exception as cleanup_error:
            logger.error(f"Failed to delete temp file {tmp_path}: {cleanup_error}")
        raise

    logger.info(f"Successfully stored snapshot to {final_path}")

    # Schema consistency reference file; failing to create it is not fatal.
    schema_path = dest_dir / "_schema.parquet"
    if not schema_path.exists():
        try:
            # Write an empty dataframe with identical columns and schema
            df.iloc[:0].to_parquet(schema_path, index=False, engine="pyarrow")
            logger.info(f"Created schema reference file at {schema_path}")
        except Exception as e:
            logger.error(f"Failed to create schema reference file: {e}")

    return str(final_path)
|
||||||
|
|
||||||
|
|
||||||
|
def read_checkpoint(path: str) -> dict:
    """
    Read the checkpoint JSON file if it exists.

    Args:
        path: Location of the checkpoint file.

    Returns:
        The stored checkpoint dictionary, or an empty dict if the file is
        missing, unreadable, not valid JSON, or does not contain a JSON
        object at the top level.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        logger.info(f"Checkpoint file {path} not found. Starting fresh.")
        return {}
    except Exception as e:
        logger.warning(f"Checkpoint file {path} exists but is corrupted: {e}. Starting fresh.")
        return {}

    # json.load also accepts lists, strings, numbers and null. Callers use
    # .get() and item assignment, so anything but an object is treated as a
    # corrupted checkpoint.
    if not isinstance(data, dict):
        logger.warning(
            f"Checkpoint file {path} exists but is corrupted: "
            f"expected a JSON object, got {type(data).__name__}. Starting fresh."
        )
        return {}

    logger.info(f"Successfully read checkpoint from {path}")
    return data
|
||||||
|
|
||||||
|
|
||||||
|
def write_checkpoint(path: str, data: dict) -> None:
    """
    Write the checkpoint dictionary to the given JSON file path.

    The data is first written to ``<path>.tmp`` and then moved over the
    target with ``os.replace`` so a partially written file never replaces
    the existing checkpoint.

    Args:
        path: Destination of the checkpoint file; its parent directory is
            created when missing.
        data: JSON-serializable checkpoint contents.

    Note:
        Write errors are logged and swallowed rather than raised, so a failed
        checkpoint update does not interrupt the caller.
    """
    dir_path = os.path.dirname(path)
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)

    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4)
        os.replace(tmp_path, path)
        logger.debug(f"Checkpoint updated at {path}")
    except Exception as e:
        logger.error(f"Failed to write checkpoint to {path}: {e}")
        # Remove the leftover temp file; report (rather than hide) a failed
        # cleanup, matching the handling in store_parquet.
        try:
            os.remove(tmp_path)
        except FileNotFoundError:
            pass
        except Exception as cleanup_error:
            logger.error(f"Failed to delete temp file {tmp_path}: {cleanup_error}")
|
||||||
107
p2p-collector/tests/test_normalizer.py
Normal file
107
p2p-collector/tests/test_normalizer.py
Normal file
|
|
@ -0,0 +1,107 @@
|
||||||
|
import unittest
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
# Add the parent folder to path to import normalizer
|
||||||
|
import sys
|
||||||
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
from normalizer import normalize_ad
|
||||||
|
|
||||||
|
class TestNormalizer(unittest.TestCase):
    """Checks normalize_ad against the recorded sample responses."""

    def setUp(self):
        # Sample responses live next to the tests directory.
        tests_dir = os.path.dirname(os.path.abspath(__file__))
        self.buy_json_path = os.path.join(tests_dir, "..", "sample_responses", "response_buy.json")
        self.sell_json_path = os.path.join(tests_dir, "..", "sample_responses", "response_sell.json")

    @staticmethod
    def _first_ad(json_path):
        """Return the first ad entry of a recorded response file."""
        with open(json_path, "r", encoding="utf-8") as handle:
            payload = json.load(handle)
        return payload["data"][0]

    def test_normalize_buy_ad(self):
        raw_ad = self._first_ad(self.buy_json_path)
        fetched_at = datetime(2026, 6, 5, 13, 30, 0, tzinfo=timezone.utc)

        normalized = normalize_ad(raw_ad, "BUY", fetched_at)

        # Expected value per output field; the key set doubles as the schema.
        expected = {
            "snapshot_id": "20260605T133000Z_BUY",
            "fetched_at": fetched_at,
            "fetched_date": "2026-06-05",
            "trade_type": "BUY",
            "adv_no": "6f8b2e12345",
            "asset": "USDT",
            "fiat": "VES",
            "price": 58.50,
            "surplus_amount": 1520.43,
            "min_amount": 100.0,
            "max_amount": 5000.0,
            "tradable_quantity": 1520.43,
            "advertiser_no": "ABC123456",
            "advertiser_name": "CryptoTraderVE",
            "advertiser_type": "merchant",
            "month_order_count": 342,
            "month_finish_rate": 0.97,
            "positive_rate": 0.99,
            "user_positive_rate": 0.99,
            "payment_methods": ["BANESCO", "PAGO_MOVIL"],
            "payment_method_ids": ["Banco_Banesco", "Pago_Movil"],
            "price_type": "FIXED",
            # Ad creation time parsed from 1749128400000 ms
            "ad_created_at": datetime.fromtimestamp(1749128400000 / 1000, tz=timezone.utc),
        }

        # Verify schema keys
        self.assertEqual(set(normalized.keys()), set(expected))

        # Verify content mapping
        for field, value in expected.items():
            self.assertEqual(normalized[field], value)

    def test_normalize_sell_ad(self):
        raw_ad = self._first_ad(self.sell_json_path)
        fetched_at = datetime(2026, 6, 5, 13, 30, 0, tzinfo=timezone.utc)

        normalized = normalize_ad(raw_ad, "SELL", fetched_at)

        expected = {
            "snapshot_id": "20260605T133000Z_SELL",
            "trade_type": "SELL",
            "adv_no": "7a9c3d98765",
            "price": 62.30,
            "payment_methods": ["MERCANTIL", "PAGO_MOVIL"],
            "payment_method_ids": ["Banco_Mercantil", "Pago_Movil"],
        }
        for field, value in expected.items():
            self.assertEqual(normalized[field], value)

    def test_defensive_handling(self):
        # Missing or unparsable values must fall back to defaults, not crash.
        bad_raw_ad = {
            "adv": {
                "advNo": "bad_ad",
                "price": "not_a_float",
                "surplusAmount": None
            },
            "advertiser": {
                "userNo": "bad_advertiser",
                "monthOrderCount": "not_an_int"
            }
        }

        normalized = normalize_ad(bad_raw_ad, "BUY", datetime.now(timezone.utc))

        expected = {
            "adv_no": "bad_ad",
            "price": 0.0,              # fallback
            "surplus_amount": 0.0,     # fallback
            "month_order_count": 0,    # fallback
            "advertiser_no": "bad_advertiser",
        }
        for field, value in expected.items():
            self.assertEqual(normalized[field], value)


if __name__ == "__main__":
    unittest.main()
|
||||||
72
p2p-collector/tests/test_storage.py
Normal file
72
p2p-collector/tests/test_storage.py
Normal file
|
|
@ -0,0 +1,72 @@
|
||||||
|
import unittest
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import sys
|
||||||
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
from storage import store_parquet, read_checkpoint, write_checkpoint
|
||||||
|
|
||||||
|
class TestStorage(unittest.TestCase):
    """Round-trip checks for store_parquet and the checkpoint helpers."""

    def setUp(self):
        # Every test writes into its own throwaway directory.
        self.temp_dir = tempfile.TemporaryDirectory()
        self.base_dir = self.temp_dir.name

    def tearDown(self):
        self.temp_dir.cleanup()

    def test_store_parquet(self):
        fetched_at = datetime(2026, 6, 5, 13, 30, 0, tzinfo=timezone.utc)
        rows = [
            {"col1": "val1", "col2": 1.5, "col3": True},
            {"col1": "val2", "col2": 2.5, "col3": False}
        ]

        written_path = store_parquet(rows, self.base_dir, fetched_at)

        # The snapshot file exists ...
        self.assertTrue(os.path.exists(written_path))

        # ... inside year=2026/month=06/day=05/ with the timestamped name.
        partition_dir = os.path.join(self.base_dir, "year=2026", "month=06", "day=05")
        self.assertTrue(written_path.startswith(partition_dir))
        self.assertTrue(written_path.endswith("snapshot_20260605_133000.parquet"))

        # The schema reference file sits in the same partition,
        schema_file = os.path.join(partition_dir, "_schema.parquet")
        self.assertTrue(os.path.exists(schema_file))

        # holds no rows but carries the (sorted) columns.
        schema_frame = pd.read_parquet(schema_file)
        self.assertEqual(len(schema_frame), 0)
        self.assertEqual(list(schema_frame.columns), sorted(["col1", "col2", "col3"]))

        # The data file round-trips the stored rows.
        data_frame = pd.read_parquet(written_path)
        self.assertEqual(len(data_frame), 2)
        self.assertEqual(data_frame.iloc[0]["col1"], "val1")
        self.assertEqual(data_frame.iloc[1]["col2"], 2.5)

    def test_checkpoint(self):
        checkpoint_path = os.path.join(self.base_dir, "checkpoint.json")

        # A missing checkpoint reads as an empty dict.
        self.assertEqual(read_checkpoint(checkpoint_path), {})

        # Whatever is written is read back unchanged.
        payload = {
            "last_completed_snapshot": "2026-06-05T13:30:00Z",
            "last_buy_ad_count": 47,
            "last_sell_ad_count": 53
        }
        write_checkpoint(checkpoint_path, payload)

        self.assertEqual(read_checkpoint(checkpoint_path), payload)


if __name__ == "__main__":
    unittest.main()
|
||||||
97
p2p-collector/tests/test_validator.py
Normal file
97
p2p-collector/tests/test_validator.py
Normal file
|
|
@ -0,0 +1,97 @@
|
||||||
|
import unittest
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
from validator import validate_row, validate_snapshot
|
||||||
|
|
||||||
|
class TestValidator(unittest.TestCase):
    """Row-level and snapshot-level validation checks."""

    def setUp(self):
        self.config = {
            "validation": {
                "price_min": 1.0,
                "price_max": 500.0,
                "reject_zero_finish_rate": True,
                "reject_zero_surplus": True
            }
        }

        # Baseline row that passes every rule; tests override single fields.
        self.valid_row = {
            "adv_no": "12345",
            "price": 58.50,
            "surplus_amount": 100.0,
            "month_finish_rate": 0.95,
            "month_order_count": 100,
            "payment_methods": ["BANESCO"],
            "ad_created_at": datetime.now(timezone.utc)
        }

    def _row_with(self, **overrides):
        """Return a copy of the baseline row with the given fields replaced."""
        return dict(self.valid_row, **overrides)

    def test_validate_row_valid(self):
        seen = set()
        self.assertTrue(validate_row(self.valid_row, self.config, seen))
        self.assertIn("12345", seen)

    def test_validate_row_duplicate(self):
        already_seen = {"12345"}
        self.assertFalse(validate_row(self.valid_row, self.config, already_seen))

    def test_validate_row_invalid_price(self):
        seen = set()
        # First a price <= 0, then a price above the configured maximum.
        for bad_price in (-1.0, 1000.0):
            self.assertFalse(validate_row(self._row_with(price=bad_price), self.config, seen))

    def test_validate_row_zero_surplus(self):
        seen = set()
        self.assertFalse(validate_row(self._row_with(surplus_amount=0.0), self.config, seen))

    def test_validate_row_suspicious_finish(self):
        seen = set()
        suspicious = self._row_with(month_finish_rate=0.0, month_order_count=5)
        self.assertFalse(validate_row(suspicious, self.config, seen))

    def test_validate_snapshot_empty(self):
        with self.assertRaises(ValueError):
            validate_snapshot([], datetime.now(timezone.utc))

    def test_validate_snapshot_calculations(self):
        fetched_at = datetime.now(timezone.utc)
        # (trade_type, price, payment method) for two BUY and two SELL ads
        specs = [
            ("BUY", 58.00, "BANESCO"),
            ("BUY", 59.00, "PAGO_MOVIL"),
            ("SELL", 61.00, "MERCANTIL"),
            ("SELL", 62.00, "BANESCO"),
        ]
        ads = [
            {"trade_type": side, "price": price, "payment_methods": [method], "ad_created_at": fetched_at}
            for side, price, method in specs
        ]

        summary = validate_snapshot(ads, fetched_at)

        expected = {
            "buy_count": 2,
            "sell_count": 2,
            "buy_min": 58.00,
            "buy_max": 59.00,
            "sell_min": 61.00,
            "sell_max": 62.00,
            "buy_median": 58.50,
            "sell_median": 61.50,
            # spread = sell_min - buy_max = 61.00 - 59.00 = 2.00
            "spread": 2.00,
        }
        for field, value in expected.items():
            self.assertEqual(summary[field], value)
        self.assertEqual(set(summary["methods"]), {"BANESCO", "PAGO_MOVIL", "MERCANTIL"})


if __name__ == "__main__":
    unittest.main()
|
||||||
15
p2p-collector/utils.py
Normal file
15
p2p-collector/utils.py
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
import random
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
def jitter(interval: float) -> float:
    """
    Return ``interval`` shifted by a random offset of at most 10% either way.
    """
    max_offset = interval * 0.10
    offset = random.uniform(-max_offset, max_offset)
    return interval + offset
|
||||||
|
|
||||||
|
def now_utc() -> datetime:
    """
    Return the current time as a timezone-aware datetime in UTC.
    """
    return datetime.now(tz=timezone.utc)
|
||||||
147
p2p-collector/validator.py
Normal file
147
p2p-collector/validator.py
Normal file
|
|
@ -0,0 +1,147 @@
|
||||||
|
import logging
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def validate_row(row: dict, config: dict, seen_adv_nos: set) -> bool:
    """
    Validate a single normalized ad row.

    Args:
        row: Normalized ad; the fields read are ``adv_no``, ``price``,
            ``surplus_amount``, ``month_finish_rate`` and
            ``month_order_count``.
        config: Configuration mapping; its ``validation`` section may set
            ``price_min``, ``price_max``, ``reject_zero_finish_rate`` and
            ``reject_zero_surplus``.
        seen_adv_nos: Ad numbers already accepted in this snapshot. The
            row's ``adv_no`` is added to it when the row is accepted.

    Returns:
        True if the row is valid, or False if it should be rejected.
    """
    val_config = config.get("validation", {})
    price_min = val_config.get("price_min", 1.0)
    price_max = val_config.get("price_max", 500.0)
    reject_zero_finish = val_config.get("reject_zero_finish_rate", True)
    reject_zero_surplus = val_config.get("reject_zero_surplus", True)

    # 1. Empty adv_no
    adv_no = row.get("adv_no")
    if not adv_no:
        logger.error("Rejecting ad: Missing adv_no.")
        return False

    # 2. Duplicate adv_no within same snapshot
    if adv_no in seen_adv_nos:
        logger.warning(f"Rejecting ad {adv_no}: Duplicate within the same snapshot.")
        return False

    # 3. Price is None or <= 0
    price = row.get("price")
    if price is None or price <= 0:
        logger.warning(f"Rejecting ad {adv_no}: Price is None or <= 0 ({price}).")
        return False

    # 4. Price outside expected range. Written as a positive range test so
    #    that NaN (for which every comparison is False) is rejected as well.
    if not (price_min <= price <= price_max):
        logger.warning(
            f"Rejecting ad {adv_no}: Price {price} is outside configured range [{price_min}, {price_max}]."
        )
        return False

    # 5. Surplus amount None or <= 0 ("not > 0" also rejects NaN)
    surplus = row.get("surplus_amount")
    if reject_zero_surplus and (surplus is None or not surplus > 0):
        logger.debug(f"Rejecting ad {adv_no}: Surplus amount is None or <= 0 ({surplus}).")
        return False

    # 6. Suspicious advertiser stats: monthFinishRate is 0.0 and monthOrderCount > 0.
    #    A missing order count is treated as 0 so the comparison cannot raise.
    finish_rate = row.get("month_finish_rate")
    order_count = row.get("month_order_count") or 0
    if reject_zero_finish and finish_rate == 0.0 and order_count > 0:
        logger.warning(
            f"Rejecting ad {adv_no}: Advertiser finished 0.0% of {order_count} orders."
        )
        return False

    seen_adv_nos.add(adv_no)
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def validate_snapshot(
    flat_ads: list,
    fetched_at: datetime,
    *,
    min_expected_ads: int = 20,
    max_expected_ads: int = 200,
    stale_after_days: float = 7.0,
) -> dict:
    """
    Validate one snapshot of normalized, already-filtered ads and summarize it.

    Args:
        flat_ads: Ad dicts for both sides of the book. Each dict is read for
            "trade_type" ("BUY" or "SELL"), "price", "ad_created_at" and
            "payment_methods". Ads with any other trade type are ignored for
            the price statistics but still count for stale/method checks.
        fetched_at: Snapshot timestamp; ad creation times are subtracted from
            it, so both must be either timezone-aware or naive.
        min_expected_ads: Per-side ad count below which a warning is logged.
        max_expected_ads: Per-side ad count above which a warning is logged.
        stale_after_days: Ad age (in days) above which an ad counts as stale.

    Returns:
        A dict with keys buy_count, sell_count, buy_min, buy_max, sell_min,
        sell_max, buy_median, sell_median, spread, stale_count and methods
        (sorted unique payment methods). Price statistics of an empty side,
        and the spread when either side is empty, are reported as 0.0.

    Raises:
        ValueError: If the snapshot contains neither BUY nor SELL ads.
    """
    buy_prices = [ad["price"] for ad in flat_ads if ad.get("trade_type") == "BUY"]
    sell_prices = [ad["price"] for ad in flat_ads if ad.get("trade_type") == "SELL"]

    # An entirely empty snapshot is the only condition treated as fatal.
    if not buy_prices and not sell_prices:
        raise ValueError("CRITICAL: Empty snapshot! Both BUY and SELL ad counts are 0.")

    buy_count = len(buy_prices)
    sell_count = len(sell_prices)

    # Unusual book depth is suspicious but not fatal: warn and carry on.
    for side, count in (("BUY", buy_count), ("SELL", sell_count)):
        if count < min_expected_ads or count > max_expected_ads:
            logger.warning(
                f"Unusual {side} ad count: {count} "
                f"(expected {min_expected_ads}-{max_expected_ads})."
            )

    buy_min = min(buy_prices) if buy_prices else 0.0
    buy_max = max(buy_prices) if buy_prices else 0.0
    sell_min = min(sell_prices) if sell_prices else 0.0
    sell_max = max(sell_prices) if sell_prices else 0.0

    buy_median = float(np.median(buy_prices)) if buy_prices else 0.0
    sell_median = float(np.median(sell_prices)) if sell_prices else 0.0

    # spread = SELL_min - BUY_max; only meaningful when both sides have ads.
    both_sides = bool(buy_prices and sell_prices)
    spread = sell_min - buy_max if both_sides else 0.0

    # BUY ads: the advertiser buys USDT and wants to pay as little VES as
    # possible. SELL ads: the advertiser sells USDT and wants to receive as
    # much VES as possible. The cheapest SELL is therefore expected to sit at
    # or above the most expensive BUY; anything else is an overlap.
    if both_sides and buy_max > sell_min:
        logger.warning(
            f"BUY/SELL price overlap detected! Max BUY price ({buy_max:.2f}) > Min SELL price ({sell_min:.2f})."
        )

    # Count ads whose creation time is older than the staleness threshold.
    # Ads without a creation time are skipped.
    stale_limit_seconds = stale_after_days * 24 * 3600
    stale_count = 0
    for ad in flat_ads:
        created_at = ad.get("ad_created_at")
        if created_at and (fetched_at - created_at).total_seconds() > stale_limit_seconds:
            stale_count += 1

    if stale_count > 0:
        logger.warning(
            f"Stale ads detected: {stale_count} ads were created > {stale_after_days:g} days ago."
        )

    # Collect unique payment methods. `or ()` covers a key that is present but
    # None, and a bare string is kept whole instead of being split into
    # characters by set.update().
    all_methods = set()
    for ad in flat_ads:
        methods = ad.get("payment_methods") or ()
        if isinstance(methods, str):
            all_methods.add(methods)
        else:
            all_methods.update(methods)

    return {
        "buy_count": buy_count,
        "sell_count": sell_count,
        "buy_min": buy_min,
        "buy_max": buy_max,
        "sell_min": sell_min,
        "sell_max": sell_max,
        "buy_median": buy_median,
        "sell_median": sell_median,
        "spread": spread,
        "stale_count": stale_count,
        "methods": sorted(all_methods),
    }
|
||||||
Loading…
Reference in a new issue