4.2 Find SHA-1 hash collisions forTwo PDF documents

Expert Solution

To generate two PDFs with different contents but identical SHA1 hashes.

PDFs are rendered into JPGs and merged into the output file. They must have the same page size and page count.

Requires ghostscript, turbojpeg, PIL, and Python 3.

Uses the "shattered" PDF prologue from shattered.io - credit to Marc Stevens et al. for the collision.

from hashlib import sha1, sha256
	import tempfile
	import subprocess
	import os
	import sys
	import shutil
	import itertools
	import logging
	import re
	from PIL import Image

	logging.basicConfig(stream=sys.stderr, level=logging.DEBUG, format='%(asctime)s %(levelname)s: %(message)s', datefmt='[%H:%M:%S]')
	logger = logging

	prefix1 = bytes.fromhex("25 50 44 46 2D 31 2E 33 0A 25 E2 E3 CF D3 0A 0A 0A 31 20 30 20 6F 62 6A 0A 3C 3C 2F 57 69 64 74 68 20 32 20 30 20 52 2F 48 65 69 67 68 74 20 33 20 30 20 52 2F 54 79 70 65 20 34 20 30 20 52 2F 53 75 62 74 79 70 65 20 35 20 30 20 52 2F 46 69 6C 74 65 72 20 36 20 30 20 52 2F 43 6F 6C 6F 72 53 70 61 63 65 20 37 20 30 20 52 2F 4C 65 6E 67 74 68 20 38 20 30 20 52 2F 42 69 74 73 50 65 72 43 6F 6D 70 6F 6E 65 6E 74 20 38 3E 3E 0A 73 74 72 65 61 6D 0A FF D8 FF FE 00 24 53 48 41 2D 31 20 69 73 20 64 65 61 64 21 21 21 21 21 85 2F EC 09 23 39 75 9C 39 B1 A1 C6 3C 4C 97 E1 FF FE 01 73 46 DC 91 66 B6 7E 11 8F 02 9A B6 21 B2 56 0F F9 CA 67 CC A8 C7 F8 5B A8 4C 79 03 0C 2B 3D E2 18 F8 6D B3 A9 09 01 D5 DF 45 C1 4F 26 FE DF B3 DC 38 E9 6A C2 2F E7 BD 72 8F 0E 45 BC E0 46 D2 3C 57 0F EB 14 13 98 BB 55 2E F5 A0 A8 2B E3 31 FE A4 80 37 B8 B5 D7 1F 0E 33 2E DF 93 AC 35 00 EB 4D DC 0D EC C1 A8 64 79 0C 78 2C 76 21 56 60 DD 30 97 91 D0 6B D0 AF 3F 98 CD A4 BC 46 29 B1")
	prefix2 = bytes.fromhex("25 50 44 46 2D 31 2E 33 0A 25 E2 E3 CF D3 0A 0A 0A 31 20 30 20 6F 62 6A 0A 3C 3C 2F 57 69 64 74 68 20 32 20 30 20 52 2F 48 65 69 67 68 74 20 33 20 30 20 52 2F 54 79 70 65 20 34 20 30 20 52 2F 53 75 62 74 79 70 65 20 35 20 30 20 52 2F 46 69 6C 74 65 72 20 36 20 30 20 52 2F 43 6F 6C 6F 72 53 70 61 63 65 20 37 20 30 20 52 2F 4C 65 6E 67 74 68 20 38 20 30 20 52 2F 42 69 74 73 50 65 72 43 6F 6D 70 6F 6E 65 6E 74 20 38 3E 3E 0A 73 74 72 65 61 6D 0A FF D8 FF FE 00 24 53 48 41 2D 31 20 69 73 20 64 65 61 64 21 21 21 21 21 85 2F EC 09 23 39 75 9C 39 B1 A1 C6 3C 4C 97 E1 FF FE 01 7F 46 DC 93 A6 B6 7E 01 3B 02 9A AA 1D B2 56 0B 45 CA 67 D6 88 C7 F8 4B 8C 4C 79 1F E0 2B 3D F6 14 F8 6D B1 69 09 01 C5 6B 45 C1 53 0A FE DF B7 60 38 E9 72 72 2F E7 AD 72 8F 0E 49 04 E0 46 C2 30 57 0F E9 D4 13 98 AB E1 2E F5 BC 94 2B E3 35 42 A4 80 2D 98 B5 D7 0F 2A 33 2E C3 7F AC 35 14 E7 4D DC 0F 2C C1 A8 74 CD 0C 78 30 5A 21 56 64 61 30 97 89 60 6B D0 BF 3F 98 CD A8 04 46 29 A1")
	jpeg1 = prefix1[149:]
	jpeg2 = prefix2[149:]

	assert sha1(prefix1).hexdigest() == sha1(prefix2).hexdigest()

	class SimpleObject:
	def __init__(self, data):
	self.data = data
	def serialize(self, outfile):
	outfile.write(str(self.data).encode())

	class MagicJPEGObject:
	def __init__(self, jpegdata):
	self.data = jpegdata
	def add_properties(self, jpegsize, pdf):
	# width
	pdf.add_object(SimpleObject(jpegsize[0]))
	# height
	pdf.add_object(SimpleObject(jpegsize[1]))
	pdf.add_object(SimpleObject('/XObject'))
	pdf.add_object(SimpleObject('/Image'))
	pdf.add_object(SimpleObject('/DCTDecode'))
	pdf.add_object(SimpleObject('/DeviceRGB'))
	pdf.add_object(SimpleObject(len(self.data)))
	def serialize(self, outfile):
	# format fixed by the SHA1 collision header
	outfile.write(b'<</Width 2 0 R/Height 3 0 R/Type 4 0 R/Subtype 5 0 R/Filter 6 0 R/ColorSpace 7 0 R/Length 8 0 R/BitsPerComponent 8>>\n')
	outfile.write(b'stream\n')
	outfile.write(self.data)
	outfile.write(b'\nendstream')

	class StreamObject:
	def __init__(self, data):
	self.data = data
	def serialize(self, outfile):
	outfile.write(b'<</Length %d>>\n' % len(self.data))
	outfile.write(b'stream\n')
	outfile.write(self.data)
	outfile.write(b'\nendstream')

	class PagesObject:
	def __init__(self):
	self.pages = []
	def add_page(self, page_oid):
	self.pages.append(page_oid)
	def serialize(self, outfile):
	outfile.write(b'<</Type /Pages /Count %d /Kids [%s]>>' % (
	len(self.pages), b' '.join(b'%d 0 R' % pid for pid in self.pages)))

	class PDFGenerator:
	def __init__(self):
	self.objects = [None]
	self.rootid = 0

	def add_object(self, obj):
	oid = len(self.objects)
	self.objects.append(obj)
	return oid

	def set_root(self, oid):
	self.rootid = oid

	def serialize(self, outfile):
	outfile.write(b'%PDF-1.3\n'
	b'%\xe2\xe3\xcf\xd3\n'
	b'\n')

	xref = [(0, 65535, b'f')]
	for oid in range(1, len(self.objects)):
	xref.append((outfile.tell(), 0, b'n'))
	outfile.write(b'\n%d 0 obj\n' % oid)
	self.objects[oid].serialize(outfile)
	outfile.write(b'\nendobj\n')
	outfile.write(b'\n\n')

	xrefpos = outfile.tell()
	outfile.write(b'xref\n')
	outfile.write(b'%d %d\n' % (0, len(xref)))
	for offs, gen, status in xref:
	outfile.write(b'%010d %05d %c \n' % (offs, gen, status))
	outfile.write(b'\n')

	outfile.write(b'trailer << /Root %d 0 R /Size %d >>\n' % (self.rootid, len(xref)))
	outfile.write(b'\nstartxref\n')
	outfile.write(b'%d\n' % xrefpos)
	outfile.write(b'%%EOF\n')

	def add_image_page(pdf, pages_oid, imw, imh, cropx, cropy, cropw, croph, pagew, pageh):
	'''
	Add a page to the PDF that is [pagew x pageh] PDF units in size,
	and which shows /Im0 in the desired crop region
	'''

	content_oid = pdf.add_object(StreamObject(b'''
	q
	%.8f 0 0 %.8f 0 0 cm
	1 0 0 1 %d %d cm
	%d 0 0 %d 0 0 cm
	/Im0 Do
	Q''' % (pagew / cropw, pageh / croph,
	-cropx, cropy + croph - imh,
	imw, imh)))
	page_oid = pdf.add_object(SimpleObject('''<<
	/Type /Page
	/Parent %(parent)d 0 R
	/MediaBox [0 0 %(w)d %(h)d]
	/CropBox [0 0 %(w)d %(h)d]
	/Contents %(content)d 0 R
	/Resources << /XObject <</Im0 1 0 R>> >> >>''' % dict(parent=pages_oid, w=pagew, h=pageh, content=content_oid)))

	pdf.objects[pages_oid].add_page(page_oid)

	def test_basicpage():
	from io import BytesIO

	pdf = PDFGenerator()
	w, h = 1024, 740
	jpeg = MagicJPEGObject(open('shattered-1.jpg', 'rb').read())
	pdf.add_object(jpeg)
	jpeg.add_properties((w, h), pdf)

	pages = PagesObject()
	pages_oid = pdf.add_object(pages)
	root_oid = pdf.add_object(SimpleObject('<< /Type /Catalog /Pages %d 0 R >>' % pages_oid))
	pdf.set_root(root_oid)

	add_image_page(pdf, pages_oid, w, h, 0, 0, 256, 740, 1024, 740)
	add_image_page(pdf, pages_oid, w, h, 256, 0, 256, 740, 1024, 740)
	add_image_page(pdf, pages_oid, w, h, 512, 0, 256, 740, 1024, 740)
	add_image_page(pdf, pages_oid, w, h, 768, 0, 256, 740, 1024, 740)

	outfile = BytesIO()
	pdf.serialize(outfile)
	print(outfile.getvalue().hex())
	assert outfile.getvalue().startswith(prefix1)

	def parse_args(argv):
	import argparse

	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
	parser.add_argument('-r', '--resolution', help="PDF resolution to render at, in DPI (default 300)", default=300, type=int)
	parser.add_argument('-q', '--quality', help="JPEG quality to use, 0-100 (default 80)", default=80, type=int)
	parser.add_argument('--progressive', action='store_true',
	help="Use progressive encoding? If not set, this uses a restart interval "
	"encoding scheme which works more often but produces less compatible PDFs.")
	parser.add_argument("file1", help="First input PDF")
	parser.add_argument("file2", help="Second input PDF")

	return parser.parse_args(argv)

	def jpeg_comment(n):
	assert n < 65534, "JPEG contains oversized image block; encoding not possible"
	n += 2
	return b'\xff\xfe' + bytes([n>>8, n & 0xff])

	def merge_jpegs(j1, j2):
	assert j1[:2] == b'\xff\xd8' and j1[-2:] == b'\xff\xd9', "header or footer unexpected"
	assert j2[:2] == b'\xff\xd8' and j2[-2:] == b'\xff\xd9', "header or footer unexpected"

	re_header = b'(?=\xff[\xd0-\xd7\xda])' # look for restart interval or start of scan
	j1k = re.sub(re_header, b'\xff\xff', j1[2:-2])
	j1s = re.split(b'\xff\xff' + re_header, j1k)

	out = bytearray(0x173 - 129)
	# we have 12 bytes between the end of j1 and j2
	out += jpeg_comment(4)
	out += b'nneo'

	for k1 in j1s:
	out += jpeg_comment(4)
	out += jpeg_comment(len(k1) + 4)
	out += k1

	out += b'\xff\xd9'
	out += b'\xff\xd9'

	out += j2[2:]

	return bytes(out)

	def doit(outdir, args):
	pagecounts = [0, 0]

	for i, fn in enumerate([args.file1, args.file2]):
	logger.info("rendering file %d..." % (i+1))
	subprocess.check_call(['gs', '-r%d' % args.resolution, '-sDEVICE=png16m',
	'-o', os.path.join(outdir, 't%d-%%04d.png' % (i+1)), fn])

	for pgn in itertools.count(1):
	if not os.path.exists(os.path.join(outdir, 't%d-%04d.png' % (i+1, pgn))):
	break

	pagecounts[i] = pgn - 1

	if pagecounts[0] != pagecounts[1]:
	raise Exception("Page counts must be equal.")

	pagecount = pagecounts[0]

	if pagecount == 0:
	raise Exception("No pages found - did PDF decoding fail?")

	# find a good packing structure
	basew, baseh = Image.open(os.path.join(outdir, 't1-%04d.png' % pagecount)).size
	rows, cols = 1, 1
	while rows * cols < pagecount:
	if rows * baseh < cols * basew:
	rows += 1
	else:
	cols += 1

	masterw, masterh = basewcols, basehrows
	master1 = Image.new('RGB', (masterw, masterh))
	master2 = Image.new('RGB', (masterw, masterh))

	# pack pages into master images
	logger.info("rendering images")
	page_descs = []
	for i in range(pagecount):
	im1 = Image.open(os.path.join(outdir, 't1-%04d.png' % (i+1)))
	im2 = Image.open(os.path.join(outdir, 't2-%04d.png' % (i+1)))
	if im1.size != im2.size:
	raise Exception("Page %ds aren't the same size!" % (i+1))

	pagew, pageh = im1.size
	r, c = divmod(i, cols)
	px, py, pw, ph = cbasew, rbaseh, basew, baseh
	master1.paste(im1, (px, py, px+pw, py+ph))
	master2.paste(im2, (px, py, px+pw, py+ph))

	page_descs.append((px, py, pw, ph, pagew / args.resolution * 72, pageh / args.resolution * 72))

	# convert images into JPG, with restart interval set to split file into suitable segments
	logger.info("saving master images to TGA")
	master1.save(os.path.join(outdir, 'p1.tga'))
	master2.save(os.path.join(outdir, 'p2.tga'))

	logger.info("converting images to JPG")
	if args.progressive:
	j1enc = ['-progressive']
	else:
	j1enc = ['-restart', '250B']

	subprocess.check_call(['cjpeg', '-quality', str(args.quality)] + j1enc + [
	'-outfile', os.path.join(outdir, 'p1.jpg'),
	'-optimize', '-verbose', '-targa', os.path.join(outdir, 'p1.tga')])
	# image 2 doesn't need restart intervals since it's just pasted right at the end
	subprocess.check_call(['cjpeg', '-quality', str(args.quality),
	'-outfile', os.path.join(outdir, 'p2.jpg'),
	'-optimize', '-verbose', '-targa', os.path.join(outdir, 'p2.tga')])

	with open(os.path.join(outdir, 'p1.jpg'), 'rb') as f1:
	with open(os.path.join(outdir, 'p2.jpg'), 'rb') as f2:
	jpeg_footer = merge_jpegs(f1.read(), f2.read())

	# render PDFs
	logger.info("producing final PDFs")
	pdf = PDFGenerator()
	jpeg = MagicJPEGObject(jpeg1 + jpeg_footer)
	pdf.add_object(jpeg)
	jpeg.add_properties((masterw, masterh), pdf)

	pages = PagesObject()
	pages_oid = pdf.add_object(pages)
	root_oid = pdf.add_object(SimpleObject('<< /Type /Catalog /Pages %d 0 R >>' % pages_oid))
	pdf.set_root(root_oid)

	for page in page_descs:
	add_image_page(pdf, pages_oid, masterw, masterh, *page)

	with open('out-' + os.path.basename(args.file1), 'wb') as outfile:
	pdf.serialize(outfile)

	jpeg.data = jpeg2 + jpeg_footer
	with open('out-' + os.path.basename(args.file2), 'wb') as outfile:
	pdf.serialize(outfile)

	def main(argv):
	args = parse_args(argv)

	outdir = tempfile.mkdtemp()

	try:
	doit(outdir, args)
	finally:
	shutil.rmtree(outdir)

	if __name__ == '__main__':
	import sys
	exit(main(sys.argv[1:]))

If you have any other queries or for more information go through this link:

https://security.googleblog.com/2017/02/announcing-first-sha1-collision.html

venereology answered 1 year ago

Consider inserting n distinct keys into a hash table with m buckets, where collisions are resolved...

Consider inserting n distinct keys into a hash table with m buckets, where collisions are resolved by chaining and simple uniform hashing applies. 1. What is the probability that none of the n keys hashed to a particular bucket? 2. What is the expected number of empty buckets? 3. What is the expected number of collisions?

What does it mean by “MAC times”? What are the possible metadata of PDF documents? What...

What does it mean by “MAC times”? What are the possible metadata of PDF documents? What metadata can you find from 2014 Tax Return Form 1040 from IRS’ website (http://www.irs.gov/pub/irs-pdf/f1040.pdf?portlet=3 )?

What are the (1) hash algorithm identifier, (2) cost factor, (3) salt, and (4) hash value...

What are the (1) hash algorithm identifier, (2) cost factor, (3) salt, and (4) hash value of the following given hashed password that is generated by bcrypt()? $2y$16$.2Boh8Y80U.RaFr1sFdaKuP5/B2Z3I8OM7vUTUvPzD2UEG3VMcXra

Creating SHA-1 Hashes. Provide a screenshot for each of questions 3a – 3d. a. Create a...

Creating SHA-1 Hashes. Provide a screenshot for each of questions 3a – 3d. a. Create a SHA-1 hash of your last name using the code on page 43 in your lab book. Put your name in all lowercase letters. Is the MD the same length as your last name? (20 points) b. Using the up arrow, change the first character of your last name to be an uppercase letter. Changing one character in your last name changes how many characters...

1. Astronomers beleive that there is no planet in the asteroid belt region because a. collisions...

1. Astronomers beleive that there is no planet in the asteroid belt region because a. collisions in the asteroid belt break apart objects instead of letting these objects grow through accretion. b. the magnetic field of Jupiter prevented the material in the asteroid belt from gathering into a planet. c. the asteroid belt is highly inclined to the orbits of the planets so there was not enough material to form a planet. d. orbital resonances with Jupiter disrupted the orbits...

(a) Let X and Y have the joint pdf ???(?, ?)=1, 0≤x≤3/2, 0≤y≤1, zero elsewhere. Find:...

(a) Let X and Y have the joint pdf ???(?, ?)=1, 0≤x≤3/2, 0≤y≤1, zero elsewhere. Find: 1 The pdf of Z=X+Y 2 The pdf of Z=X.Y

Internet Research Worksheet Topic: ________________________________________________________________________ Find two documents from the Internet on the subject of your...

Internet Research Worksheet Topic: ________________________________________________________________________ Find two documents from the Internet on the subject of your next speech. Provide a complete citation for each article following the A.P.A.bibliography format and answer the questions about each document. (For sample bibliography formats check in d2l in the content section) 1. Document: _________________________________________________________________ __________________________________________________________________________ Did you locate this document through a search engine? Yes ❑ No ❑ If you answered yes, what is the name of the search engine? _________________________ If you answered no, how did you locate...

Find the moment generating function of the PDF for the beta distribution. Please show all steps,...

Find the moment generating function of the PDF for the beta distribution. Please show all steps, thank you

(i) Find the marginal probability distributions for the random variables X1 and X2 with joint pdf...

(i) Find the marginal probability distributions for the random variables X1 and X2 with joint pdf f(x1, x2) = 12x1x2(1-x2) , 0 < x1 <1 0 < x2 < 1 , otherwise (ii) Calculate E(X1) and E(X2) (iii) Are the variables X1 and X2 stochastically independent? (iv) Given the variables in the question, find the conditional p.d.f. of X1 given 0<x2< ½ and the conditional expectation E[X1|0<x2< ½ ].

Group 1: 4.2, 4.2, 3.4 Group 2: 4.5, 2.1, 2.3 Group 3: 1.2, 0.3, -0.3, 2.3...

Group 1: 4.2, 4.2, 3.4 Group 2: 4.5, 2.1, 2.3 Group 3: 1.2, 0.3, -0.3, 2.3 Use the Bonferronni method to test each of the 3 possible hypotheses at the 3% significance level. (a) Find the value of the test statistic for each of the 3 possible hypotheses. (b) Which pairs of means are significantly different (using the Bonferronni method at the 3% significance level?

Question