import zipfile, re, xml.etree.ElementTree as ET, collections, json
from pathlib import Path
paths = [
Path('/Users/neo/Downloads/wanchuan_anhui_v9_v2_1_long_asset_R29_R169_asset_schedule_atomic_SAFE_20260513-092323.xlsx'),
Path('/Users/neo/.hermes/audits/cashflow-asset-hook-20260512-083049/wanchuan_anhui_v9_v2_1_long_asset_R29_R169_asset_schedule_atomic_SAFE_20260513-092323.xlsx'),
Path('/Users/neo/.hermes/audits/cashflow-asset-hook-20260512-083049/wanchuan_anhui_v9_v2_1_long_asset_R29_R58_asset_class_atomic_SAFE_20260513-085849.xlsx')
]
SHEET='长期资产循环_全量穿透校验'
NS={'m':'http://schemas.openxmlformats.org/spreadsheetml/2006/main','r':'http://schemas.openxmlformats.org/officeDocument/2006/relationships'}
def locate(z):
    wb=ET.fromstring(z.read('xl/workbook.xml'))
    rels=ET.fromstring(z.read('xl/_rels/workbook.xml.rels'))
    ridmap={r.attrib['Id']:r.attrib['Target'] for r in rels}
    for sh in wb.find('m:sheets',NS):
        if sh.attrib.get('name')==SHEET:
            return 'xl/'+ridmap[sh.attrib['{'+NS['r']+'}id']].lstrip('/')
    raise KeyError(SHEET)

def analyze(p):
    print('\n====',p,'exists',p.exists(),'====')
    if not p.exists(): return
    with zipfile.ZipFile(p) as z:
        print('testzip', z.testzip())
        part=locate(z); print('part', part)
        data=z.read(part)
        xml=data.decode('utf-8')
        try:
            ET.fromstring(data); print('xml_parse ok')
        except Exception as e:
            print('xml_parse error',repr(e))
        refs=re.findall(r'<c\b[^>]*\br="([A-Z]+\d+)"', xml)
        cnt=collections.Counter(refs)
        dups=[(r,n) for r,n in cnt.items() if n>1]
        print('cell_count',len(refs),'unique',len(cnt),'dups',len(dups),dups[:50])
        # row-level duplicate/order check and cells with illegal formula+inlineStr or invalid t
        bad_order=[]; bad_combo=[]; bad_t=[]; weird_formula=[]
        def colnum(ref):
            m=re.match(r'([A-Z]+)(\d+)$',ref); n=0
            for ch in m.group(1): n=n*26+ord(ch)-64
            return n
        for rm in re.finditer(r'<row\b[^>]*\br="(\d+)"[^>]*>(.*?)</row>', xml, re.S):
            r=int(rm.group(1)); body=rm.group(2)
            rowrefs=re.findall(r'<c\b[^>]*\br="([A-Z]+%d)"' % r, body)
            nums=[colnum(x) for x in rowrefs]
            if nums != sorted(nums): bad_order.append((r,rowrefs))
            for cm in re.finditer(r'<c\b([^>]*)>(.*?)</c>|<c\b([^>]*)/>', body, re.S):
                attrs=(cm.group(1) or cm.group(3) or '')
                cell=cm.group(0)
                rr=re.search(r'\br="([A-Z]+\d+)"', attrs)
                ref=rr.group(1) if rr else '?'
                tm=re.search(r'\bt="([^"]+)"', attrs)
                t=tm.group(1) if tm else None
                if t and t not in ['b','d','e','inlineStr','n','s','str']:
                    bad_t.append((ref,t,cell[:120]))
                if t=='inlineStr' and '<f' in cell:
                    bad_combo.append((ref,'inlineStr+formula',cell[:200]))
                if '<f' in cell:
                    # formula text checks
                    fm=re.search(r'<f\b[^>]*>(.*?)</f>|<f\b[^>]*/>', cell, re.S)
                    ftxt=(fm.group(1) if fm and fm.group(1) else '')
                    if ftxt.startswith('=') or '&' in ftxt and not re.search(r'&(amp|lt|gt|quot|apos|#\d+|#x[0-9A-Fa-f]+);', ftxt):
                        weird_formula.append((ref, ftxt[:200], cell[:240]))
        print('bad_order',len(bad_order),bad_order[:10])
        print('bad_combo',len(bad_combo),bad_combo[:10])
        print('bad_t',len(bad_t),bad_t[:10])
        print('weird_formula',len(weird_formula),weird_formula[:10])
        # cells in changed region with t attr and formula/value
        print('sample formula string cells N64:N180')
        for ref in ['N71','N105','N112','N137','N169','N172']:
            m=re.search(r'<c\b(?=[^>]*\br="'+ref+r'")[^>]*(?:/>|>.*?</c>)', xml, re.S)
            print(ref, m.group(0)[:300] if m else 'missing')
for p in paths: analyze(p)
