import zipfile, re, shutil, hashlib, json, subprocess, os
from pathlib import Path
import xml.etree.ElementTree as ET
import openpyxl

BAD = Path('/Users/neo/.hermes/audits/cashflow-asset-hook-20260512-083049/wanchuan_anhui_v9_v2_1_long_asset_R29_R169_asset_schedule_atomic_SAFE_20260513-092323.xlsx')
BASE = Path('/Users/neo/.hermes/audits/cashflow-asset-hook-20260512-083049/wanchuan_anhui_v9_v2_1_long_asset_R29_R58_asset_class_atomic_SAFE_20260513-085849.xlsx')
ORIG = Path('/Users/neo/.hermes/cache/documents/doc_259cbd37e734_万传安徽_V9_v2.1_R66拆3sub+12sub行全拆开_20260512.xlsx')
OUTDIR = Path('/Users/neo/.hermes/audits/cashflow-asset-hook-20260512-083049')
STAMP = '20260513-094726'
OUT = OUTDIR / f'wanchuan_anhui_v9_v2_1_long_asset_R29_R169_asset_schedule_atomic_EXCEL_REPAIR_SAFE_{STAMP}.xlsx'
VERIFY = OUTDIR / f'phase11_excel_repair_root_cause_and_fix_{STAMP}.md'
SHEET = '长期资产循环_全量穿透校验'
NS={'m':'http://schemas.openxmlformats.org/spreadsheetml/2006/main','r':'http://schemas.openxmlformats.org/officeDocument/2006/relationships'}

def sha256_path(p):
    h=hashlib.sha256()
    with open(p,'rb') as f:
        for chunk in iter(lambda:f.read(1024*1024), b''):
            h.update(chunk)
    return h.hexdigest()

def locate_sheet_part(xlsx, sheet_name):
    with zipfile.ZipFile(xlsx) as z:
        wb=ET.fromstring(z.read('xl/workbook.xml'))
        rels=ET.fromstring(z.read('xl/_rels/workbook.xml.rels'))
        ridmap={r.attrib['Id']:r.attrib['Target'] for r in rels}
        for sh in wb.find('m:sheets',NS):
            if sh.attrib.get('name')==sheet_name:
                rid=sh.attrib['{'+NS['r']+'}id']
                target=ridmap[rid]
                return 'xl/'+target.lstrip('/') if not target.startswith('xl/') else target
    raise KeyError(sheet_name)

def colnum(col):
    n=0
    for ch in col:
        n=n*26+ord(ch)-64
    return n

def split(ref):
    m=re.match(r'([A-Z]+)(\d+)$',ref)
    if not m: raise ValueError(ref)
    return colnum(m.group(1)), int(m.group(2)), m.group(1)

def expand_range(rng):
    a,b=rng.split(':') if ':' in rng else (rng,rng)
    c1,r1,_=split(a); c2,r2,_=split(b)
    return c1,r1,c2,r2,a

def find_merged_ranges(xml):
    return re.findall(r'<mergeCell\b[^>]*ref="([^"]+)"', xml)

def cell_ref_from_xml(cell_xml):
    m=re.search(r'\br="([A-Z]+\d+)"', cell_xml)
    return m.group(1) if m else None

def strip_value_from_cell(cell_xml):
    # Keep style/metadata for borders, remove formula/value/inline string and any data type.
    if cell_xml.endswith('/>'):
        return cell_xml, False
    ref=cell_ref_from_xml(cell_xml)
    start=re.match(r'<c\b([^>]*)>', cell_xml, re.S)
    if not start:
        return cell_xml, False
    attrs=start.group(1)
    attrs=re.sub(r'\s+t="[^"]+"', '', attrs)
    # A cell with only style attrs is the Excel-safe representation for non-top-left merged cells.
    return '<c'+attrs+'/>', True

def merged_value_violations(xml, min_row=1, max_row=10**9):
    merged=find_merged_ranges(xml)
    ranges=[]
    for rng in merged:
        c1,r1,c2,r2,top=expand_range(rng)
        if r2<min_row or r1>max_row: continue
        ranges.append((rng,c1,r1,c2,r2,top))
    violations=[]
    cell_pat = r'<c\b(?=[^>]*\br="([A-Z]+\d+)")[^>]*/>|<c\b(?=[^>]*\br="([A-Z]+\d+)")[^>]*>.*?</c>'
    for cm in re.finditer(cell_pat, xml, re.S):
        ref=cm.group(1) or cm.group(2); c,r,_=split(ref)
        if r<min_row or r>max_row: continue
        cell=cm.group(0)
        has_payload=('<f' in cell) or ('<v' in cell) or ('<is' in cell)
        if not has_payload: continue
        for rng,c1,r1,c2,r2,top in ranges:
            tc,tr,_=split(top)
            if c1<=c<=c2 and r1<=r<=r2 and not (c==tc and r==tr):
                violations.append((ref,rng,cell[:220]))
                break
    return violations

part=locate_sheet_part(BAD,SHEET)
orig_sha_before=sha256_path(ORIG)
base_sha_before=sha256_path(BASE)
bad_sha_before=sha256_path(BAD)
with zipfile.ZipFile(BAD,'r') as zin:
    names=zin.namelist()
    xml=zin.read(part).decode('utf-8')
    before_viol=merged_value_violations(xml,64,180)
    merged=find_merged_ranges(xml)
    ranges=[]
    for rng in merged:
        c1,r1,c2,r2,top=expand_range(rng)
        if r2>=64 and r1<=180:
            tc,tr,_=split(top)
            ranges.append((rng,c1,r1,c2,r2,tc,tr))

    changed=[]
    def repl(m):
        cell=m.group(0); ref=(m.group(1) or m.group(2)); c,r,_=split(ref)
        for rng,c1,r1,c2,r2,tc,tr in ranges:
            if c1<=c<=c2 and r1<=r<=r2 and not (c==tc and r==tr):
                new, did = strip_value_from_cell(cell)
                if did and new != cell:
                    changed.append((ref,rng,cell[:220],new[:120]))
                    return new
        return cell
    cell_pat = r'<c\b(?=[^>]*\br="([A-Z]+\d+)")[^>]*/>|<c\b(?=[^>]*\br="([A-Z]+\d+)")[^>]*>.*?</c>'
    xml2=re.sub(cell_pat, repl, xml, flags=re.S)
    after_viol=merged_value_violations(xml2,64,180)
    # Sanity: XML parse
    ET.fromstring(xml2.encode('utf-8'))
    tmp=OUT.with_suffix('.tmp.xlsx')
    with zipfile.ZipFile(tmp,'w',compression=zipfile.ZIP_DEFLATED) as zout:
        for name in names:
            data=zin.read(name)
            if name==part:
                data=xml2.encode('utf-8')
            zout.writestr(zin.getinfo(name),data)
    if OUT.exists(): OUT.unlink()
    tmp.replace(OUT)

# verification
with zipfile.ZipFile(OUT) as z:
    zip_test=z.testzip()
    out_xml=z.read(part).decode('utf-8')
    out_viol=merged_value_violations(out_xml,64,180)
    shared=[]
    for m in re.finditer(r'<c\b(?=[^>]*\br="([A-Z]+)(\d+)")[^>]*>.*?<f\b[^>]*\bt="shared"[^>]*>.*?</c>', out_xml, re.S):
        row=int(m.group(2))
        if 64<=row<=180:
            shared.append(m.group(1)+m.group(2))
    calc_present='xl/calcChain.xml' in z.namelist()
    prefixes_preserved=all(x in out_xml for x in ['mc:Ignorable="x14ac xr xr2 xr3"','xmlns:x14ac=','xmlns:xr=','xmlns:xr2=','xmlns:xr3='])

# Compare zip members bad vs out and base/orig unchanged
bad_sha_after=sha256_path(BAD)
base_sha_after=sha256_path(BASE)
orig_sha_after=sha256_path(ORIG)
changed_parts=[]
with zipfile.ZipFile(BAD) as z1, zipfile.ZipFile(OUT) as z2:
    for n in sorted(set(z1.namelist())|set(z2.namelist())):
        d1=z1.read(n) if n in z1.namelist() else None
        d2=z2.read(n) if n in z2.namelist() else None
        if d1!=d2: changed_parts.append(n)

rows_check={}
load_formula_ok=load_value_ok=False
try:
    wbf=openpyxl.load_workbook(OUT,data_only=False,read_only=False,keep_links=True)
    load_formula_ok=True
except Exception as e:
    formula_err=repr(e)
try:
    wbv=openpyxl.load_workbook(OUT,data_only=True,read_only=False,keep_links=True)
    load_value_ok=True
    ws=wbv[SHEET]; wsf=wbf[SHEET] if load_formula_ok else None
    for r in [66,71,105,112,137,169,171,172,32,33,45,49,50,56,57,58]:
        rows_check[r]={
            'K': ws[f'K{r}'].value,
            'L': ws[f'L{r}'].value,
            'M': ws[f'M{r}'].value,
            'N': ws[f'N{r}'].value,
            'K_formula': wsf[f'K{r}'].value if wsf else None,
        }
except Exception as e:
    value_err=repr(e)

lo_status='SKIP'
try:
    lo_dir=OUTDIR/f'lo_phase11_{STAMP}'
    if lo_dir.exists(): shutil.rmtree(lo_dir)
    lo_dir.mkdir(parents=True,exist_ok=True)
    soffice=shutil.which('soffice') or shutil.which('libreoffice') or '/Applications/LibreOffice.app/Contents/MacOS/soffice'
    if soffice and Path(soffice).exists():
        cp=subprocess.run([soffice,'--headless','--convert-to','xlsx','--outdir',str(lo_dir),str(OUT)],stdout=subprocess.PIPE,stderr=subprocess.PIPE,text=True,timeout=120)
        lo_status=f'exit={cp.returncode}; stdout={cp.stdout.strip()}; stderr={cp.stderr.strip()}'
        cands=list(lo_dir.glob('*.xlsx'))
        if cands:
            with zipfile.ZipFile(cands[0]) as z:
                lo_status += f'; roundtrip_zip_test={z.testzip()}'
    else:
        lo_status='SKIP: soffice not found'
except Exception as e:
    lo_status='ERROR: '+repr(e)

main_rows=[66,71,105,112,137,169,171,172]
main_rows_pass=all(rows_check.get(r,{}).get('N')=='✓' and abs(float(rows_check.get(r,{}).get('M') or 0))<0.001 for r in main_rows)
verify={
    'bad_input':str(BAD),'output':str(OUT),'verify':str(VERIFY),'worksheet_part':part,
    'root_cause':'Phase10 inserted inlineStr dash/value cells into non-top-left cells of merged rows (A92:N92, A126:N126, A147:N147, A159:N159). Excel repairs merged ranges by removing those cell records; openpyxl/LibreOffice/zip tests do not catch this.',
    'merged_value_violations_before_count':len(before_viol),'merged_value_violations_before':before_viol,
    'sanitized_cells_count':len(changed),'sanitized_cells':changed,
    'merged_value_violations_after_count':len(out_viol),'merged_value_violations_after':out_viol,
    'zip_testzip':zip_test,'calcChain_present':calc_present,'prefixes_preserved':prefixes_preserved,
    'shared_formula_cells_R64_R180':shared,'changed_parts':changed_parts,
    'orig_sha_before_after_same':orig_sha_before==orig_sha_after,
    'base_sha_before_after_same':base_sha_before==base_sha_after,
    'bad_input_sha_before_after_same':bad_sha_before==bad_sha_after,
    'openpyxl_formula_ok':load_formula_ok,'openpyxl_value_ok':load_value_ok,
    'main_rows_pass':main_rows_pass,'rows':rows_check,
    'libreoffice_smoke':lo_status,
}
lines=[]
lines.append(f'# Phase 11 Excel repair root cause and fixed workbook ({STAMP})')
lines.append('')
lines.append('## Root cause')
lines.append('Excel 报 `Removed Records: Cell information from /xl/worksheets/sheet26.xml` 的根因不是 zip/XML 损坏，也不是 calcChain/namespace；这次是 **merged range 非左上角单元格被写入了带内容的 cell record**。')
lines.append('')
lines.append('Phase 10 为了把 `L/M/N` 的 shared formula/空值统一成显式 `—`，用了一个批量循环，覆盖了若干整行合并区：')
lines.append('- `A92:N92`')
lines.append('- `A126:N126`')
lines.append('- `A147:N147`')
lines.append('- `A159:N159`')
lines.append('')
lines.append('这些合并区里只有左上角 `A*` 可以有内容；`M92/N92/M126/N126/M147/N147/M159/N159` 被插入 `<c ... t="inlineStr"><is><t>—</t></is></c>` 后，Excel 会认为合并区域内部有非法 cell information，于是打开时修复并删除记录。`openpyxl`、`zipfile.testzip()`、LibreOffice 都不会报这个错误，所以之前的验证漏掉了。')
lines.append('')
lines.append('## Fix applied')
lines.append('- 输出修复版：`'+str(OUT)+'`')
lines.append('- 仅编辑 `xl/worksheets/sheet26.xml`。')
lines.append('- 对 `R64:R180` 范围内所有合并区的非左上角单元格，保留样式/边框但剥离 `<f>/<v>/<is>` 内容和 `t="..."` 类型，变为 Excel-safe 的样式空 cell 或无内容 cell。')
lines.append('- 不改源文件、不改 Phase 10 输入文件、不改前面 R29:R58 的公式逻辑。')
lines.append('')
lines.append('## Verification')
for k in ['merged_value_violations_before_count','sanitized_cells_count','merged_value_violations_after_count','zip_testzip','calcChain_present','prefixes_preserved','shared_formula_cells_R64_R180','changed_parts','orig_sha_before_after_same','base_sha_before_after_same','bad_input_sha_before_after_same','openpyxl_formula_ok','openpyxl_value_ok','main_rows_pass','libreoffice_smoke']:
    lines.append(f'- `{k}`: `{verify[k]}`')
lines.append('')
lines.append('## Main row closure still intact')
lines.append('| Row | K | L | M | N |')
lines.append('|---:|---:|---:|---:|:--:|')
for r in [66,71,105,112,137,169,171,172,32,33,45,49,50,56,57,58]:
    rc=rows_check.get(r,{})
    lines.append(f"| {r} | {rc.get('K')} | {rc.get('L')} | {rc.get('M')} | {rc.get('N')} |")
lines.append('')
lines.append('## Prevention rule')
lines.append('以后 OOXML 外科式写入 cell 前，必须先读取 `<mergeCells>`：若目标 cell 落在合并区且不是左上角，不能写公式/值/inlineStr；若需要占位，只能保持样式空 cell，或者写到合并区左上角。验证也必须加入 “merged non-top-left payload = 0” 检查。')
lines.append('')
lines.append('## JSON')
lines.append('```json')
lines.append(json.dumps(verify,ensure_ascii=False,indent=2,default=str))
lines.append('```')
VERIFY.write_text('\n'.join(lines),encoding='utf-8')
print(json.dumps(verify,ensure_ascii=False,indent=2,default=str))
