Set up continuous P2P VES/USDT market history data collection, normalization, validation, and date-partitioned Parquet storage.
147 lines
5.7 KiB
Python
147 lines
5.7 KiB
Python
import logging
|
|
from datetime import datetime, timezone
|
|
import numpy as np
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
def validate_row(row: dict, config: dict, seen_adv_nos: set) -> bool:
    """
    Validate a single normalized ad row.

    Checks run in a fixed order and stop at the first failure:

    1. ``adv_no`` must be present and non-empty.
    2. ``adv_no`` must not already be in ``seen_adv_nos``.
    3. ``price`` must be present and > 0.
    4. ``price`` must lie inside ``[price_min, price_max]``.
    5. ``surplus_amount`` must be present and > 0
       (only when ``reject_zero_surplus`` is enabled).
    6. The advertiser must not have a 0.0 finish rate together with a
       positive order count (only when ``reject_zero_finish_rate`` is enabled).

    Args:
        row: Normalized ad; read keys are ``adv_no``, ``price``,
            ``surplus_amount``, ``month_finish_rate`` and ``month_order_count``.
        config: Configuration dict; thresholds are read from its optional
            ``"validation"`` sub-dict (``price_min`` default 1.0, ``price_max``
            default 500.0, ``reject_zero_finish_rate`` and
            ``reject_zero_surplus`` default True).
        seen_adv_nos: Set of ad numbers already accepted in this snapshot.
            It is mutated: the row's ``adv_no`` is added only when the row
            passes every check.

    Returns:
        True if the row is valid, False if it should be rejected.
    """
    val_config = config.get("validation", {})
    price_min = val_config.get("price_min", 1.0)
    price_max = val_config.get("price_max", 500.0)
    reject_zero_finish = val_config.get("reject_zero_finish_rate", True)
    reject_zero_surplus = val_config.get("reject_zero_surplus", True)

    # 1. Empty adv_no
    adv_no = row.get("adv_no")
    if not adv_no:
        logger.error("Rejecting ad: Missing adv_no.")
        return False

    # 2. Duplicate adv_no within same snapshot
    if adv_no in seen_adv_nos:
        logger.warning(f"Rejecting ad {adv_no}: Duplicate within the same snapshot.")
        return False

    # 3. Price is None or <= 0
    price = row.get("price")
    if price is None or price <= 0:
        logger.warning(f"Rejecting ad {adv_no}: Price is None or <= 0 ({price}).")
        return False

    # 4. Price outside expected range
    if price < price_min or price > price_max:
        logger.warning(
            f"Rejecting ad {adv_no}: Price {price} is outside configured range [{price_min}, {price_max}]."
        )
        return False

    # 5. Surplus amount None or <= 0
    surplus = row.get("surplus_amount")
    if reject_zero_surplus and (surplus is None or surplus <= 0):
        logger.debug(f"Rejecting ad {adv_no}: Surplus amount is None or <= 0 ({surplus}).")
        return False

    # 6. Suspicious advertiser stats: monthFinishRate is 0.0 and monthOrderCount > 0.
    # A missing/None order count cannot be compared with 0 (TypeError), and it
    # does not prove that any order was left unfinished, so it is not flagged.
    finish_rate = row.get("month_finish_rate")
    order_count = row.get("month_order_count")
    if (
        reject_zero_finish
        and finish_rate == 0.0
        and order_count is not None
        and order_count > 0
    ):
        logger.warning(
            f"Rejecting ad {adv_no}: Advertiser finished 0.0% of {order_count} orders."
        )
        return False

    # Register the ad only after it has passed every check, so a rejected ad
    # never blocks a later valid ad that carries the same number.
    seen_adv_nos.add(adv_no)
    return True
|
|
|
|
|
|
def validate_snapshot(flat_ads: list, fetched_at: datetime) -> dict:
    """
    Validate all normalized and filtered ads of a single snapshot.

    Logs warnings for unusual ad counts, BUY/SELL price overlap and stale
    ads, and computes summary statistics for logging/checking.

    Args:
        flat_ads: List of ad dicts. Read keys are ``trade_type``, ``price``
            (required on BUY/SELL ads), ``ad_created_at`` and
            ``payment_methods`` (both optional; a ``None`` value is treated
            like a missing one).
        fetched_at: Time the snapshot was fetched. It is subtracted from each
            ad's ``ad_created_at``, so both must be timezone-aware or both
            naive; mixing them raises ``TypeError``.

    Returns:
        Dict with keys ``buy_count``, ``sell_count``, ``buy_min``, ``buy_max``,
        ``sell_min``, ``sell_max``, ``buy_median``, ``sell_median``,
        ``spread``, ``stale_count`` and ``methods`` (sorted unique payment
        methods). Price statistics of an empty side are reported as 0.0.

    Raises:
        ValueError: If the snapshot contains neither BUY nor SELL ads.
    """
    buy_ads = [ad for ad in flat_ads if ad.get("trade_type") == "BUY"]
    sell_ads = [ad for ad in flat_ads if ad.get("trade_type") == "SELL"]

    # 1. Empty snapshot validation
    if not buy_ads and not sell_ads:
        raise ValueError("CRITICAL: Empty snapshot! Both BUY and SELL ad counts are 0.")

    buy_count = len(buy_ads)
    sell_count = len(sell_ads)

    # Warnings for low/high counts
    if buy_count < 20 or buy_count > 200:
        logger.warning(f"Unusual BUY ad count: {buy_count} (expected 20-200).")
    if sell_count < 20 or sell_count > 200:
        logger.warning(f"Unusual SELL ad count: {sell_count} (expected 20-200).")

    # Extract prices
    buy_prices = [ad["price"] for ad in buy_ads]
    sell_prices = [ad["price"] for ad in sell_ads]

    buy_min = min(buy_prices) if buy_prices else 0.0
    buy_max = max(buy_prices) if buy_prices else 0.0
    sell_min = min(sell_prices) if sell_prices else 0.0
    sell_max = max(sell_prices) if sell_prices else 0.0

    # Calculate medians
    buy_median = float(np.median(buy_prices)) if buy_prices else 0.0
    sell_median = float(np.median(sell_prices)) if sell_prices else 0.0

    # Spread as defined by the spec: SELL_min - BUY_max. It is only meaningful
    # when both sides are present; otherwise it is reported as 0.0.
    both_sides = bool(sell_prices and buy_prices)
    spread = sell_min - buy_max if both_sides else 0.0

    # Trade types per the spec:
    #   BUY  = the advertiser buys USDT from you and pays VES, so they want to
    #          pay as little VES as possible.
    #   SELL = the advertiser sells USDT to you for VES, so they want to
    #          receive as much VES as possible.
    # Hence SELL_min is expected to be above BUY_max; BUY_max > SELL_min means
    # a negative spread, i.e. the two sides of the book overlap.
    if both_sides and buy_max > sell_min:
        logger.warning(
            f"BUY/SELL price overlap detected! Max BUY price ({buy_max:.2f}) > Min SELL price ({sell_min:.2f})."
        )

    # Check for stale ads (createTime > 7 days old)
    stale_count = 0
    for ad in flat_ads:
        created_at = ad.get("ad_created_at")
        if created_at:
            age_days = (fetched_at - created_at).total_seconds() / (24 * 3600)
            if age_days > 7.0:
                stale_count += 1

    if stale_count > 0:
        logger.warning(f"Stale ads detected: {stale_count} ads were created > 7 days ago.")

    # Get unique payment methods. `or []` also covers ads whose
    # "payment_methods" key is present but None, which set.update() rejects.
    all_methods = set()
    for ad in flat_ads:
        all_methods.update(ad.get("payment_methods") or [])

    summary = {
        "buy_count": buy_count,
        "sell_count": sell_count,
        "buy_min": buy_min,
        "buy_max": buy_max,
        "sell_min": sell_min,
        "sell_max": sell_max,
        "buy_median": buy_median,
        "sell_median": sell_median,
        "spread": spread,
        "stale_count": stale_count,
        "methods": sorted(all_methods),
    }

    return summary
|