#!/usr/bin/env python
"""Merge index.htm and the chapter files it links to into one HTML file.

Run from the directory that contains index.htm.  The output is written to
'#merged_<current directory name>.htm' in that same directory.

Only the text between a line containing 'contents_start' and a line
containing 'contents_end' is taken from each chapter file.  Links that
pointed at other files are rewritten to in-page anchors.
"""

import os

_HREF_MARKER = 'href="'
# Links starting with one of these are left untouched by internalize_hrefs.
_EXTERNAL_PREFIXES = ('http', 'mailto')
# NOTE(review): the two script markers were stripped from the pasted source
# (only "line.find('= 0:" survived); reconstructed from the surrounding
# 'inside_script' logic -- confirm against the original file.
_SCRIPT_OPEN = '<script'
_SCRIPT_CLOSE = '</script'


#
# convert links from external to internal format
#
def _internal_href(href):
    """Return the in-page form of a single href value.

    External links (http..., mailto...) are returned unchanged.
    'xxx.yyy#zzz' becomes '#zzz'; 'xxx.htm' / 'xxx.html' become '#xxx';
    anything else just gets a leading '#'.
    """
    if href.startswith(_EXTERNAL_PREFIXES):
        return href
    fragment_at = href.find('#')
    if fragment_at >= 0:
        return href[fragment_at:]
    extension_at = href.find('.htm')
    if extension_at >= 0:
        href = href[:extension_at]
    return '#' + href


def internalize_hrefs(line):
    """Rewrite every href="..." value in *line* to an in-page anchor.

    Args:
        line: one line of HTML text.

    Returns:
        The line with each double-quoted href value converted by the rules
        of _internal_href.  Text outside href values is preserved exactly.
        An href whose closing quote is missing is left as it is.
    """
    pieces = []
    pos = 0
    while True:
        marker_at = line.find(_HREF_MARKER, pos)
        if marker_at < 0:
            break
        value_start = marker_at + len(_HREF_MARKER)
        value_end = line.find('"', value_start)
        if value_end < 0:
            # Unterminated attribute: copy the remainder untouched instead
            # of slicing with -1 and mangling the text.
            break
        pieces.append(line[pos:value_start])  # everything through href="
        pieces.append(_internal_href(line[value_start:value_end]))
        pos = value_end  # resume at the closing quote
    pieces.append(line[pos:])
    return ''.join(pieces)


def _chapter_filename(line):
    """Return the chapter file named by the first href in *line*, or None.

    A '#fragment' suffix is dropped so the result can be opened as a file;
    external links, fragment-only links and unterminated attributes yield
    None.
    """
    href_at = line.find('href=')
    if href_at < 0:
        return None
    # Skip 'href=' plus the opening quote character.
    target, closing_quote, _ = line[href_at + 6:].partition('"')
    if not closing_quote or target.startswith(_EXTERNAL_PREFIXES):
        return None
    return target.split('#', 1)[0] or None


#
# copy index.htm without scripts, get chapter filenames
#
def _copy_index(lines, outfile):
    """Copy the index page to *outfile* and return the chapter filenames.

    Script elements are skipped.  Lines before the 'contents_start' marker
    are copied verbatim; lines after it are copied with their links
    internalized and scanned for chapter links.  Copying stops at the
    'contents_end' marker; the marker lines themselves are not copied.
    """
    chapter_filenames = []
    found_start = False
    inside_script = False
    for line in lines:
        open_at = line.rfind(_SCRIPT_OPEN)
        if open_at >= 0:
            # A script opened and closed on the same line must not leave
            # the skip state switched on for the rest of the file.
            inside_script = line.find(_SCRIPT_CLOSE, open_at) < 0
            continue
        if _SCRIPT_CLOSE in line:
            inside_script = False
            continue
        if inside_script:
            continue
        if 'contents_end' in line:
            break
        if 'contents_start' in line:
            found_start = True
            continue
        if found_start:
            outfile.write(internalize_hrefs(line))
            filename = _chapter_filename(line)
            # Keep first-seen order, but copy each chapter only once.
            if filename and filename not in chapter_filenames:
                chapter_filenames.append(filename)
        else:
            outfile.write(line)
    return chapter_filenames


#
# copy chapter files
#
def _copy_chapter(lines, outfile):
    """Write the marked contents section of one chapter to *outfile*.

    Only lines after the 'contents_start' marker line and before the
    'contents_end' marker line are written, with links internalized.
    """
    found_start = False
    for line in lines:
        if 'contents_end' in line:
            break
        if 'contents_start' in line:
            found_start = True
            continue
        if found_start:
            outfile.write(internalize_hrefs(line))


def main():
    """Build the merged HTML file in the current working directory.

    Raises:
        OSError: if index.htm or a linked chapter file cannot be opened.
    """
    outfile_basename = '#merged_' + os.path.basename(os.getcwd())

    # Read the index first so a missing index.htm does not leave an empty
    # output file behind.
    with open('index.htm', 'r', encoding='utf-8') as infile:
        index_lines = infile.readlines()

    with open(outfile_basename + '.htm', 'w', encoding='utf-8') as outfile:
        chapter_filenames = _copy_index(index_lines, outfile)

        for filename in chapter_filenames:
            print('infile: %s' % filename)
            with open(filename, 'r', encoding='utf-8') as infile:
                _copy_chapter(infile.readlines(), outfile)

        #
        # finish output file
        #
        # NOTE(review): the original literal survived only as whitespace;
        # the closing tags are presumed to have been stripped along with
        # the script markers -- confirm against the original file.
        outfile.write("\n</body>\n</html>\n")


if __name__ == '__main__':
    main()