sphinx-web/src/data/document.py

import os
import uuid
from flask import current_app
from web_utils.task import ProcessTask
from web_utils.business_exception import BusinessException
import shutil
from unicodedata import normalize
from urllib.parse import quote as url_encode
from urllib.parse import unquote as url_decode
import string
import json

def os_path_separators():
	"""Return the list of characters that are treated as path separators.

	The list always starts with '/' and '\\'; the running platform's
	os.path.sep and os.path.altsep are appended when they are set, so the
	same character may appear more than once.
	"""
	platform_specific = [candidate for candidate in (os.path.sep, os.path.altsep) if candidate]
	return ['/', '\\'] + platform_specific

def sanitize_name(initial_name, slashReplacement = '_', removeUnknownCharacters = True):
	"""Turn an arbitrary string into a name that is safe to use as a path component.

	Parameters:
		initial_name: the string to sanitize.
		slashReplacement: text substituted for every path separator.
		removeUnknownCharacters: when true, every character other than ASCII
			letters, digits, '-', '_', '.' and the characters of
			slashReplacement is dropped.

	Returns the sanitized name.

	Raises BusinessException when the result is empty or contains '..'.
	"""
	# Decompose accented characters, then drop whatever has no ASCII equivalent
	ascii_bytes = normalize('NFKD', initial_name).encode('ascii', 'ignore')
	cleaned = ascii_bytes.decode('ascii')

	# Neutralise every path separator
	for separator in os_path_separators():
		cleaned = cleaned.replace(separator, slashReplacement)

	# Optionally keep only the whitelisted characters
	if removeUnknownCharacters:
		allowed = set("-_." + string.ascii_letters + string.digits + slashReplacement)
		cleaned = "".join(filter(lambda character: character in allowed, cleaned))

	# An empty name or a parent-directory sequence is never acceptable
	if not cleaned or '..' in cleaned:
		raise BusinessException("Invalid name: " + initial_name)

	return cleaned

class Document:
	"""A documentation project cloned from a git repository and stored on disk.

	Each document lives in <document root>/<encoded origin>/<doc name>/<branch>/.
	That directory holds the git checkout ('repo'), the API key file ('apikey'),
	an optional 'settings.json', and the build output ('build', 'dist').

	Attributes:
		origin: decoded origin of the repository (see get_origin).
		encoded_origin: origin encoded so that it fits in one path or URL segment.
		doc_name: name of the document, as given by the caller.
		branch: name of the branch, as given by the caller.
		doc_path: resolved directory of the document on disk.
		valid: True when doc_path contains a git checkout in 'repo'.
		settings: build settings; defaults overlaid with settings.json for valid documents.
	"""

	def __init__(self, origin, doc_name, branch = 'master', allow_invalid = False):
		"""Look up an existing document on disk.

		Parameters:
			origin: plain or encoded origin; it is decoded when it contains '!'.
			doc_name: name of the document.
			branch: name of the branch.
			allow_invalid: when true, a missing checkout yields an instance with
				valid == False (and default settings) instead of an exception.

		Raises BusinessException when the checkout is missing and allow_invalid
		is false, or when the names do not produce a valid document path.
		"""
		self.origin = Document.decode_origin(origin) if '!' in origin else origin
		self.encoded_origin = Document.encode_origin(self.origin)
		self.doc_name = doc_name
		self.branch = branch

		# Init default values. They are set before the validity check so that
		# every instance has the attribute, including invalid ones.
		self.settings = {
			'multiversion': False,
			'default_version': '', # Only used if multiversion is True
			'build_pdf': False, # for multiversion, this can be set to an array of strings indicating each branch or tag for which we want to build the PDF
		}

		self.doc_path = Document.make_doc_path(self.origin, doc_name, branch)
		self.valid = os.path.isdir(self.doc_path + "/repo/.git")
		if not self.valid:
			if allow_invalid:
				return
			raise BusinessException("This document does not exist: "+self.origin+'/'+doc_name+"@"+branch)

		self._load_settings()

	def _load_settings(self):
		"""Overlay the values found in settings.json (if it exists) on the current settings."""
		json_settings_filename = self.doc_path + '/settings.json'
		if not os.path.exists(json_settings_filename):
			return
		with open(json_settings_filename) as f:
			json_settings = json.load(f)
		# Only known keys are taken over; anything else in the file is ignored
		for key in self.settings:
			if key in json_settings:
				self.settings[key] = json_settings[key]

	def build(self):
		"""Queue the commands that update the checkout, build the HTML (and
		optionally the PDF) and move the result to the 'dist' directory.

		Returns the started ProcessTask.

		Raises Exception when the document is not valid.
		"""
		if not self.valid:
			raise Exception("Internal error")

		# Function-scope import: shlex is only needed to quote shell arguments here
		import shlex

		multiversion_build = self.settings['multiversion']
		tmp_source = self.doc_path + '/tmp_source'

		cmd = []

		is_intensive_task = False

		# update source files from git
		cmd.append(['git', 'pull'])

		if multiversion_build:
			# also fetch all branches and tags, so that sphinx-multiversion knows what versions exist and can pull them
			cmd.append(['git', 'fetch', '--all'])
			cmd.append(['bash', '-c', 'for BRANCH in $(git branch -a | grep remotes | grep -v HEAD | grep -v master); do git branch --track "${BRANCH#remotes/origin/}" "${BRANCH}" || git branch -f "${BRANCH#remotes/origin/}" -t "${BRANCH}"; done'])

			# build the HTML version
			cmd.append(['make', 'html_versions', 'BUILDDIR=../build'])

			if isinstance(self.settings['build_pdf'], list):
				for pdf_branch_name in self.settings['build_pdf']:
					is_intensive_task = True

					# Extract the source files to a temporary directory.
					# Everything interpolated into a shell string is quoted, so that
					# a branch name or path cannot inject shell syntax.
					cmd.append(['rm', '-rf', tmp_source])
					cmd.append(['mkdir', tmp_source])
					cmd.append(['bash', '-c', 'git archive ' + shlex.quote(pdf_branch_name) + ' | tar -x -C ' + shlex.quote(tmp_source)])

					# Build the PDF
					cmd.append(['bash', '-c', 'cd ' + shlex.quote(tmp_source) + ' && make pdf'])

					# Copy the generated PDF file to the HTML directory, so that it is accessible for download by users
					# NOTE(review): the generated file name 'vheliotech.pdf' is hard-coded - confirm it matches the project's Makefile
					cmd.append(['cp', tmp_source + '/build/weasyprint/vheliotech.pdf', self.doc_path + '/build/html_versions/' + pdf_branch_name + '/' + self.doc_name + '.pdf'])

					# Clean up
					cmd.append(['rm', '-rf', tmp_source])

		else:
			# build the HTML version
			cmd.append(['make', 'html', 'BUILDDIR=../build'])

			if self.settings['build_pdf']:
				is_intensive_task = True

				# build the PDF version
				cmd.append(['make', 'pdf', 'BUILDDIR=../build'])

				# Copy the generated PDF file to the HTML directory, so that it is accessible for download by users
				cmd.append(['cp', self.doc_path + '/build/weasyprint/vheliotech.pdf', self.doc_path + '/build/html/' + self.doc_name + '.pdf'])

		# Now that the build is successful, move it to the deployment directory (replacing any existing content)
		cmd.append(['rm', '-rf', self.doc_path + '/dist'])
		if multiversion_build:
			cmd.append(['mv', self.doc_path + '/build/html_versions/', self.doc_path + '/dist/'])
		else:
			cmd.append(['mv', self.doc_path + '/build/html/', self.doc_path + '/dist/'])

		# PDF builds are additionally tagged as 'intensive_task'
		# (presumably used by ProcessTask to limit concurrency - confirm in web_utils.task)
		resource_usage = [('document_files', self.doc_path)]
		if is_intensive_task:
			resource_usage.append(('intensive_task', ''))

		task = ProcessTask(cmd, cwd = self.doc_path + "/repo", resource_usage = resource_usage)
		task.start(skip_if_another_pending = self.doc_path)

		return task

	def delete(self):
		"""Delete the document from disk.

		Raises Exception when the document is not valid.
		"""
		if not self.valid:
			raise Exception("Internal error")
		self.delete_folder()

	def delete_folder(self):
		"""Remove the document directory, then its parent directories if they became empty."""
		doc_path = Document.make_doc_path(self.origin, self.doc_name, self.branch)
		shutil.rmtree(doc_path)

		# Remove the <doc name> directory when this was its last branch
		doc_root = os.path.dirname(doc_path)
		if os.listdir(doc_root):
			return
		os.rmdir(doc_root)

		# Remove the <origin> directory when this was its last document
		origin_root = os.path.dirname(doc_root)
		if not os.listdir(origin_root):
			os.rmdir(origin_root)

	def _url_prefix(self):
		"""Return the URL of the directory that holds index.html and the PDF, with a trailing '/'."""
		prefix = "/doc/" + self.encoded_origin + "/" + sanitize_name(self.doc_name) + '/' + sanitize_name(self.branch) + "/"
		if self.settings['multiversion']:
			# multiversion builds have one sub-directory per version
			prefix += self.settings['default_version'] + "/"
		return prefix

	def get_url(self):
		"""Return the URL of the document's HTML entry page."""
		return self._url_prefix() + "index.html"

	def get_pdf_url(self):
		"""Return the URL of the document's PDF file."""
		return self._url_prefix() + self.doc_name + ".pdf"

	def get_api_key(self):
		"""Return the content of the document's 'apikey' file, with newlines removed."""
		with open(self.doc_path + "/apikey") as f:
			return f.read().replace('\n', '')

	@staticmethod
	def encode_origin(origin):
		"""Percent-encode every reserved character of origin, using '!' in place of '%'."""
		return url_encode(origin, safe='').replace('%', '!')

	@staticmethod
	def decode_origin(origin):
		"""Reverse encode_origin."""
		return url_decode(origin.replace('!', '%'))

	@staticmethod
	def make_doc_path(origin, doc_name, branch):
		"""Return the resolved directory of a document.

		Raises BusinessException when the resolved path is not located inside
		the document root, or when doc_name or branch cannot be sanitized.
		"""
		# Resolve the root as well: the comparison below is only meaningful
		# between two resolved paths
		root = os.path.realpath(get_document_root())
		doc_path = os.path.realpath(os.path.join(root, Document.encode_origin(origin), sanitize_name(doc_name), sanitize_name(branch)))

		# Compare whole path components: a plain string prefix test would accept
		# a sibling directory whose name merely starts with the root's name
		try:
			inside_root = os.path.commonpath([root, doc_path]) == root
		except ValueError:
			# commonpath() refuses paths that cannot share a prefix (e.g. different drives)
			inside_root = False
		if not inside_root:
			raise BusinessException("Invalid document path for "+origin+"/"+doc_name+"@"+branch)
		return doc_path

	@staticmethod
	def get_origin(repo):
		"""Derive the origin of a repository URL: its parent path, without the 'https://' prefix.

		Raises BusinessException when the result is invalid or contains '!',
		which is reserved by encode_origin.
		"""
		result = sanitize_name(os.path.dirname(repo).replace('https://', ''), '/', False)
		if '!' in result:
			raise BusinessException("Invalid character: !")
		return result

	@staticmethod
	def clone(repo, branch, doc_name, source_dir):
		"""Create a new document by cloning a branch of an HTTPS git repository.

		Parameters:
			repo: URL of the repository; must start with 'https://'.
			branch: branch to check out.
			doc_name: name of the new document.
			source_dir: validated, but not used by the current implementation
				(the sparse checkout is disabled).

		Returns the started ProcessTask. The document directory is removed
		again by the task's on_fail callback.

		Raises BusinessException when the document already exists or when an
		argument is invalid.
		"""
		# check the document does not already exist
		origin = Document.get_origin(repo)
		doc_path = Document.make_doc_path(origin, doc_name, branch)
		if os.path.isdir(doc_path):
			raise BusinessException("This document already exists: "+origin+"/"+doc_name+"@"+branch)

		if source_dir != sanitize_name(source_dir):
			raise BusinessException("Invalid source directory name: " + source_dir)

		# we have potentially serious security issues related to cloning anything. For example cloning from SSH may use a pre-configured server identity, etc.
		if not repo.startswith("https://"):
			raise BusinessException("Only HTTPS repositories are allowed in current implementation")

		# The raw branch name is passed to git as an argument: a leading dash
		# would be interpreted as a command-line option
		if branch.startswith('-'):
			raise BusinessException("Invalid branch name: " + branch)

		# Generate an API key. It is only written to the 'apikey' file
		# (see get_api_key) and deliberately not echoed to stdout.
		apikey = str(uuid.uuid4())

		target_dir = doc_path + "/repo"
		os.makedirs(target_dir, exist_ok = True)
		with open(doc_path + "/apikey", "w") as apikey_file:
			apikey_file.write(apikey)

		cmd = []
		cmd.append(['git', 'init', '--initial-branch=' + branch])
		cmd.append(['git', 'remote', 'add', '-f', 'origin',  repo])
		#cmd.append(['git', 'sparse-checkout', 'init'])
		#cmd.append(['git', 'sparse-checkout', 'set', source_dir])
		cmd.append(['git', 'pull', 'origin', branch])
		cmd.append(['git', 'branch', '--set-upstream-to=origin/' + branch, branch])

		task = ProcessTask(cmd, cwd = target_dir)
		task.on_fail(lambda : shutil.rmtree(doc_path, ignore_errors = True))
		task.start()

		return task

	@staticmethod
	def list():
		"""Return a Document for every <origin>/<doc name>/<branch> entry of the document root.

		Entries without a git checkout are returned with valid == False.
		Stray files found where an origin or document directory is expected are skipped.
		"""
		root = get_document_root()
		result = []
		for origin in os.listdir(root):
			origin_dir = root + "/" + origin
			if not os.path.isdir(origin_dir):
				continue
			for doc_name in os.listdir(origin_dir):
				doc_dir = origin_dir + "/" + doc_name
				if not os.path.isdir(doc_dir):
					continue
				for branch in os.listdir(doc_dir):
					result.append(Document(origin, doc_name, branch, allow_invalid = True))
		return result

def get_document_root():
	"""Return the 'DOCUMENT_ROOT_DIR' entry of the current Flask application's configuration."""
	app_config = current_app.config
	return app_config['DOCUMENT_ROOT_DIR']