# convert.py (2.8 KB)
  import json
  import math
  import pickle
  import sys
  from pathlib import Path

  import numpy as np
  import pandas as pd
  def json_safe(o):
      """Recursively convert *o* into a structure that ``json.dump`` can write.

      Conversions, in the order they are tried:

      * ``None``, ``str``, ``bool`` and ``int`` are returned unchanged.
      * ``float`` and NumPy floating scalars become a plain ``float``;
        non-finite values (NaN, +inf, -inf) become ``None``.
      * ``list``, ``tuple``, ``set`` and ``frozenset`` become a list of
        converted items (set iteration order is arbitrary).
      * ``dict`` keys are passed through ``str()``; values are converted.
      * ``pandas.DataFrame`` becomes a list of per-row dicts and
        ``pandas.Series`` a dict built from ``to_dict()``.
      * ``numpy.ndarray`` becomes (nested) lists via ``tolist()``.
      * NumPy integer / bool scalars become ``int`` / ``bool``.
      * Anything else is replaced by its ``repr()``.
      """
      if o is None or isinstance(o, (str, bool, int)):
          return o
      if isinstance(o, (float, np.floating)):
          value = float(o)
          # json.dump would emit NaN/Infinity literals, which are not valid
          # JSON and are rejected by strict parsers; write null instead.
          return value if math.isfinite(value) else None
      if isinstance(o, (list, tuple, set, frozenset)):
          return [json_safe(item) for item in o]
      if isinstance(o, dict):
          return {str(key): json_safe(value) for key, value in o.items()}
      if isinstance(o, pd.DataFrame):
          return [json_safe(row) for row in o.to_dict(orient="records")]
      if isinstance(o, pd.Series):
          return json_safe(o.to_dict())
      if isinstance(o, np.ndarray):
          return json_safe(o.tolist())
      if isinstance(o, np.integer):
          return int(o)
      if isinstance(o, np.bool_):
          return bool(o)
      # Last resort: keep a readable trace of the value rather than failing.
      return repr(o)
  def dict_of_lists_to_records(d):
      """Turn a column-oriented dict into a list of row records.

      Values that are lists, tuples or NumPy arrays with at least one
      dimension are treated as columns; every other value is treated as a
      scalar and repeated in each record. Each record is passed through
      ``json_safe`` before being returned.

      Returns ``None`` when *d* has no column values or when the columns do
      not all have the same length; otherwise a list with one dict per row
      (empty when the columns are empty).
      """
      def is_column(value):
          if isinstance(value, np.ndarray):
              # 0-d arrays have no len() and cannot be indexed, so they are
              # broadcast like any other scalar instead of raising TypeError.
              return value.ndim > 0
          return isinstance(value, (list, tuple))

      # Decide once which keys are columns instead of re-checking per cell.
      column_keys = {key for key, value in d.items() if is_column(value)}
      if not column_keys:
          return None
      lengths = {len(d[key]) for key in column_keys}
      if len(lengths) != 1:
          return None
      row_count = lengths.pop()
      return [
          json_safe({
              key: (value[i] if key in column_keys else value)
              for key, value in d.items()
          })
          for i in range(row_count)
      ]
  def to_records(obj):
      """Pick the most record-like view of *obj* and return it as a list.

      Resolution order:

      1. A ``pandas.DataFrame`` is converted with ``to_dict(orient="records")``.
      2. A list that is empty, or whose first element is a dict, is returned
         as-is (only the first element is inspected).
      3. For a dict, the first of the keys ``data``, ``results``, ``records``,
         ``runs``, ``experiments`` whose value is a list is returned as-is;
         failing that, the dict is tried as a dict of equal-length columns
         via ``dict_of_lists_to_records``.
      4. Anything else is wrapped as a single-element list holding
         ``json_safe(obj)``.
      """
      if isinstance(obj, pd.DataFrame):
          return obj.to_dict(orient="records")
      if isinstance(obj, list) and (not obj or isinstance(obj[0], dict)):
          return obj
      if isinstance(obj, dict):
          for key in ("data", "results", "records", "runs", "experiments"):
              if isinstance(obj.get(key), list):
                  return obj[key]
          recs = dict_of_lists_to_records(obj)
          if recs is not None:
              return recs
      # Single fallback for dicts with no usable layout and all other types.
      return [json_safe(obj)]
  def main(src_path, out_dir=None):
      """Unpickle *src_path* and write it out as three JSON files.

      The files are written to *out_dir*, or to an ``outputs`` directory next
      to this script when *out_dir* is not given; the directory is created if
      needed. With ``<stem>`` being the source file name without its suffix:

      * ``<stem>.records.json``   - the records as one indented JSON array
      * ``<stem>.records.ndjson`` - one JSON document per record, per line
      * ``<stem>.raw.json``       - the whole unpickled object, indented

      Raises:
          FileNotFoundError: if *src_path* does not exist.
      """
      src = Path(src_path)
      if not src.exists():
          raise FileNotFoundError(src)

      if out_dir:
          out_dir = Path(out_dir)
      else:
          out_dir = Path(__file__).parent / "outputs"
      out_dir.mkdir(parents=True, exist_ok=True)

      # NOTE(security): pickle.load can execute arbitrary code embedded in the
      # file. Only run this converter on pickles from a trusted source.
      with src.open("rb") as f:
          obj = pickle.load(f)

      safe = json_safe(obj)
      records = to_records(obj)

      stem = src.stem
      out_array = out_dir / f"{stem}.records.json"
      out_ndjson = out_dir / f"{stem}.records.ndjson"
      out_raw = out_dir / f"{stem}.raw.json"

      with out_array.open("w", encoding="utf-8") as f:
          json.dump(json_safe(records), f, ensure_ascii=False, indent=2)

      with out_ndjson.open("w", encoding="utf-8") as f:
          for rec in records:
              # Records returned as-is by to_records may still hold
              # non-JSON types, so each one is sanitised before writing.
              json.dump(json_safe(rec), f, ensure_ascii=False)
              f.write("\n")

      with out_raw.open("w", encoding="utf-8") as f:
          json.dump(safe, f, ensure_ascii=False, indent=2)

      print(f"Wrote:\n- {out_array}\n- {out_ndjson}\n- {out_raw}")
  # Command-line entry point: convert.py <src.pkl> [out_dir]
  if __name__ == "__main__":
      if len(sys.argv) < 2:
          print("Usage: python convert.py <src.pkl> [out_dir]")
          sys.exit(1)
      # The output directory is optional; main() picks its default when None.
      main(sys.argv[1], sys.argv[2] if len(sys.argv) > 2 else None)