from pathlib import Path
import zipfile, hashlib, re, json, xml.etree.ElementTree as ET
from collections import Counter

# Workbook packages under comparison:
#   UP         — the uploaded/repaired candidate workbook
#   PH10-PH12  — intermediate pipeline outputs (phases 10, 11, 12)
#   BASE       — baseline workbook the merged-follower checks diff against
UP = Path('/Users/neo/.hermes/cache/documents/doc_1d136388d7ce_wanchuan_anhui_v9_v2_1_long_asset_R29_R169_asset_schedule_atomic_EXCEL_REPAIR_SAFE_20260513-094726v1.xlsx')
PH11 = Path('/Users/neo/.hermes/audits/cashflow-asset-hook-20260512-083049/wanchuan_anhui_v9_v2_1_long_asset_R29_R169_asset_schedule_atomic_EXCEL_REPAIR_SAFE_20260513-094726.xlsx')
PH12 = Path('/Users/neo/.hermes/audits/cashflow-asset-hook-20260512-083049/wanchuan_anhui_v9_v2_1_long_asset_R29_R169_asset_schedule_atomic_EXCEL_STRICT_SAFE_20260513-102004.xlsx')
BASE = Path('/Users/neo/.hermes/audits/cashflow-asset-hook-20260512-083049/wanchuan_anhui_v9_v2_1_long_asset_R29_R58_asset_class_atomic_SAFE_20260513-085849.xlsx')
PH10 = Path('/Users/neo/.hermes/audits/cashflow-asset-hook-20260512-083049/wanchuan_anhui_v9_v2_1_long_asset_R29_R169_asset_schedule_atomic_SAFE_20260513-092323.xlsx')

# XML namespaces: 'm' = spreadsheetml main, 'r' = officeDocument relationships.
NS = {'m':'http://schemas.openxmlformats.org/spreadsheetml/2006/main','r':'http://schemas.openxmlformats.org/officeDocument/2006/relationships'}
# Matches one <c> cell element — either self-closing or paired — and captures
# its r="A1"-style reference in group 1 (self-closing) or group 2 (paired).
CELL_RE = re.compile(rb'<c\b[^>]*\br="([A-Z]+[0-9]+)"[^>]*/>|<c\b[^>]*\br="([A-Z]+[0-9]+)"[^>]*>.*?</c>', re.S)

def sha(p):
    """Return the hex SHA-256 digest of file *p*, or None when it does not exist."""
    if not p.exists():
        return None
    digest = hashlib.sha256(p.read_bytes())
    return digest.hexdigest()

def zip_info(p):
    """Summarize an OOXML package: size, digest, zip integrity, notable part counts."""
    with zipfile.ZipFile(p) as z:
        names = z.namelist()
        summary = {
            'exists': p.exists(),
            'size': p.stat().st_size,
            'sha256': sha(p),
            # testzip() returns the first corrupt member name, or None if sound
            'testzip': z.testzip(),
            'parts': len(names),
            'comments': sum('comments' in n.lower() for n in names),
            'vml': sum(n.endswith('.vml') for n in names),
            'calcChain': 'xl/calcChain.xml' in names,
        }
    return summary

def locate_sheet_part(p, sheet_name='长期资产循环_全量穿透校验'):
    """Resolve the worksheet part path for *sheet_name* inside package *p*.

    Reads xl/workbook.xml to find the sheet's relationship id, then maps it
    through xl/_rels/workbook.xml.rels to the worksheet target.  Handles both
    relative targets ('worksheets/sheet1.xml' -> 'xl/worksheets/sheet1.xml')
    and absolute part names ('/xl/worksheets/sheet1.xml' — previously these
    were wrongly re-prefixed to 'xl/xl/...').

    Raises KeyError when no sheet with that name exists.
    """
    main_ns = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'
    rel_ns = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
    with zipfile.ZipFile(p) as z:
        wb = ET.fromstring(z.read('xl/workbook.xml'))
        rels = ET.fromstring(z.read('xl/_rels/workbook.xml.rels'))
        # .rels children are <Relationship Id=... Target=...> elements.
        rid_to_target = {rel.attrib['Id']: rel.attrib['Target'] for rel in rels}
        for sh in wb.find('{%s}sheets' % main_ns):
            if sh.attrib.get('name') == sheet_name:
                rid = sh.attrib['{%s}id' % rel_ns]
                target = rid_to_target[rid]
                if target.startswith('/'):
                    # Absolute part name: strip leading slash, do NOT re-prefix.
                    return target.lstrip('/')
                return target if target.startswith('xl/') else 'xl/' + target
    raise KeyError(sheet_name)

def read_part(p, part):
    """Return the raw bytes of zip member *part* from package *p*."""
    with zipfile.ZipFile(p) as z:
        data = z.read(part)
    return data

def part_hashes(p):
    """Map every member name in package *p* to the SHA-256 hex digest of its bytes."""
    hashes = {}
    with zipfile.ZipFile(p) as z:
        for name in z.namelist():
            hashes[name] = hashlib.sha256(z.read(name)).hexdigest()
    return hashes

def cell_fragments(xml_bytes):
    """Map each cell reference (e.g. 'B7') to its raw <c> XML fragment.

    If a reference occurs more than once, the last occurrence wins.
    """
    frags = {}
    for match in CELL_RE.finditer(xml_bytes):
        ref_bytes = match.group(1) or match.group(2)
        frags[ref_bytes.decode()] = match.group(0)
    return frags

def colrow(ref):
    """Split an A1-style reference into (column_index, row_number), both 1-based."""
    m = re.match(r'([A-Z]+)(\d+)$', ref)
    letters, digits = m.group(1), m.group(2)
    col = 0
    for ch in letters:
        col = col * 26 + (ord(ch) - ord('A') + 1)
    return col, int(digits)

def in_range(ref, rng):
    """True when cell *ref* lies inside the rectangle described by *rng* ('A1:B9')."""
    start, end = rng.split(':')
    c1, r1 = colrow(start)
    c2, r2 = colrow(end)
    c, r = colrow(ref)
    return (c1 <= c <= c2) and (r1 <= r <= r2)

def top_left(rng): return rng.split(':')[0]

def merged_ranges(xml_bytes):
    """Collect every <mergeCell> ref string (e.g. 'B2:D2') from a worksheet part."""
    root = ET.fromstring(xml_bytes)
    return [mc.attrib['ref'] for mc in root.findall('.//m:mergeCell', NS)]

def _col_letters(n):
    """Return the A1-style letters for 1-based column index *n* (1 -> 'A', 27 -> 'AA')."""
    s = ''
    while n:
        n, rem = divmod(n - 1, 26)
        s = chr(65 + rem) + s
    return s

def merged_follower_refs(xml_bytes, row_min=None, row_max=None):
    """List (ref, range) pairs for every non-anchor cell of each merged range.

    The top-left cell of a merge is the anchor; every other member cell is a
    'follower' and should carry no payload.  *row_min* / *row_max* bound the
    rows considered (inclusive).  Each bound is applied independently — the
    original only filtered when row_min was given, crashing if row_max was
    None and silently ignoring a lone row_max.
    """
    refs = []
    for rng in merged_ranges(xml_bytes):
        tl = top_left(rng)
        start, end = rng.split(':')
        c1, r1 = colrow(start)
        c2, r2 = colrow(end)
        for r in range(r1, r2 + 1):
            if row_min is not None and r < row_min:
                continue
            if row_max is not None and r > row_max:
                continue
            for c in range(c1, c2 + 1):
                ref = f'{_col_letters(c)}{r}'
                if ref != tl:
                    refs.append((ref, rng))
    return refs

def payload_violations(xml_bytes):
    """Find merged-follower cells that still carry content (<f>, <v> or <is>)."""
    frags = cell_fragments(xml_bytes)
    markers = (b'<f', b'<v', b'<is')
    violations = []
    for ref, rng in merged_follower_refs(xml_bytes):
        frag = frags.get(ref, b'')
        if any(mk in frag for mk in markers):
            violations.append({
                'ref': ref,
                'range': rng,
                'frag': frag.decode('utf-8', 'replace')[:200],
            })
    return violations

def dup_cells(xml_bytes):
    """Return cell references that appear more than once in the sheet XML."""
    counts = Counter(
        (m.group(1) or m.group(2)).decode() for m in CELL_RE.finditer(xml_bytes)
    )
    return [ref for ref, seen in counts.items() if seen > 1]

def compare_parts(p1, p2):
    """Diff two packages at the part level.

    Returns names unique to each side plus shared parts whose bytes differ.
    """
    h1 = part_hashes(p1)
    h2 = part_hashes(p2)
    shared = set(h1) & set(h2)
    return {
        'only_1': sorted(set(h1) - set(h2)),
        'only_2': sorted(set(h2) - set(h1)),
        'changed': sorted(n for n in shared if h1[n] != h2[n]),
    }

def follower_diffs(p, base=BASE, row_min=1, row_max=1048576):
    """Report merged-follower cells whose XML fragment in *p* differs from *base*.

    Followers are derived from the BASE sheet's merged ranges, restricted to
    rows [row_min, row_max].
    """
    base_xml = read_part(base, locate_sheet_part(base))
    other_xml = read_part(p, locate_sheet_part(p))
    base_frags = cell_fragments(base_xml)
    other_frags = cell_fragments(other_xml)
    diffs = []
    for ref, rng in merged_follower_refs(base_xml, row_min, row_max):
        a = base_frags.get(ref, b'')
        b = other_frags.get(ref, b'')
        if a != b:
            diffs.append({
                'ref': ref,
                'range': rng,
                'base': a.decode('utf-8', 'replace'),
                'other': b.decode('utf-8', 'replace'),
            })
    return diffs

def changed_cells_between(p1, p2, row_min=1, row_max=1048576):
    """List cells of the target sheet that differ between packages *p1* and *p2*.

    Cells are visited in row-major order.  The previous sort key
    (row*1000 + col) mis-ordered references once the column index reached
    1000 (xlsx allows up to 16384); sorting on the (row, col) tuple is
    always correct.  Only rows in [row_min, row_max] are reported, and each
    side's fragment is truncated to 300 characters.
    """
    f1 = cell_fragments(read_part(p1, locate_sheet_part(p1)))
    f2 = cell_fragments(read_part(p2, locate_sheet_part(p2)))
    out = []
    # colrow(ref)[::-1] == (row, col): row-major ordering without collisions.
    for ref in sorted(set(f1) | set(f2), key=lambda ref: colrow(ref)[::-1]):
        _, row = colrow(ref)
        if not (row_min <= row <= row_max):
            continue
        a = f1.get(ref, b'')
        b = f2.get(ref, b'')
        if a != b:
            out.append({
                'ref': ref,
                'a': a.decode('utf-8', 'replace')[:300],
                'b': b.decode('utf-8', 'replace')[:300],
            })
    return out

# --- driver: compare the uploaded package against pipeline phases -----------
paths = {'uploaded': UP, 'phase10': PH10, 'phase11': PH11, 'phase12': PH12, 'base': BASE}
res = {'zip_info': {k: zip_info(p) for k, p in paths.items() if p.exists()}}
res['sheet_parts'] = {k: locate_sheet_part(p) for k, p in paths.items() if p.exists()}

# Package-level part diffs: uploaded vs each phase and the base.
for name, p in [('phase11', PH11), ('phase12', PH12), ('phase10', PH10), ('base', BASE)]:
    if p.exists():
        cmp = compare_parts(UP, p)
        res[f'uploaded_vs_{name}_parts'] = {k: (v[:20] if isinstance(v, list) else v) for k, v in cmp.items()}
        res[f'uploaded_vs_{name}_changed_count'] = len(cmp['changed'])

# Sheet-level structural checks on the uploaded workbook.  Each check is
# computed once (the original re-ran dup_cells/payload_violations per key,
# re-scanning the whole sheet XML each time).
upxml = read_part(UP, locate_sheet_part(UP))
dups = dup_cells(upxml)
violations = payload_violations(upxml)
res['uploaded_sheet26'] = {
    'duplicate_cells_count': len(dups),
    'duplicate_cells': dups[:20],
    'merged_payload_violations_count': len(violations),
    'merged_payload_violations_sample': violations[:20],
    'merged_ranges_count': len(merged_ranges(upxml)),
}

# Merged-follower drift vs the base workbook in the target row window.
for label, p in [('uploaded', UP), ('phase11', PH11), ('phase12', PH12), ('phase10', PH10)]:
    if p.exists():
        diffs = follower_diffs(p, BASE, 64, 180)
        res[f'{label}_merged_follower_diffs_vs_base_R64_R180_count'] = len(diffs)
        res[f'{label}_merged_follower_diffs_vs_base_R64_R180_sample'] = diffs[:20]

# Cell-level diffs uploaded vs phase11/phase12 in rows 64-180.  Each pair is
# diffed once (the original called changed_cells_between twice per pair,
# re-opening and re-parsing both packages for count and sample separately).
for tag, p in [('phase11', PH11), ('phase12', PH12)]:
    if p.exists():
        changed = changed_cells_between(UP, p, 64, 180)
        res[f'uploaded_vs_{tag}_changed_cells_R64_R180_count'] = len(changed)
        res[f'uploaded_vs_{tag}_changed_cells_R64_R180_sample'] = changed[:50]
    else:
        res[f'uploaded_vs_{tag}_changed_cells_R64_R180_count'] = None
        res[f'uploaded_vs_{tag}_changed_cells_R64_R180_sample'] = []

out = Path('/tmp/analyze_uploaded_repaired_xlsx.json')
out.write_text(json.dumps(res, ensure_ascii=False, indent=2), encoding='utf-8')
print(json.dumps(res, ensure_ascii=False, indent=2)[:20000])
print('\nJSON', out)
