diff --git a/pyproject.toml b/pyproject.toml index 1fbc250..4d629ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,14 +47,15 @@ dev = [ [tool.setuptools] package-dir = {"" = "src"} -py-modules = ["cedarscript_editor"] +py-modules = ["cedarscript_editor", "text_manipulaiton"] [tool.setuptools.dynamic] version = {attr = "cedarscript_editor.__version__"} [tool.setuptools.packages.find] where = ["src"] -include = ["cedarscript_editor*", "text_editor*"] +include = ["cedarscript_editor*", "text_editor*", "identifier_selector*", "*identifier_finder*", +"indentation_*", "range_*"] namespaces = false [tool.setuptools.package-data] diff --git a/src/cedarscript_editor/__init__.py b/src/cedarscript_editor/__init__.py index 9d6c692..6f0f36e 100644 --- a/src/cedarscript_editor/__init__.py +++ b/src/cedarscript_editor/__init__.py @@ -1,7 +1,6 @@ -from .cedarscript_editor_java import JavaCEDARScriptEditor -from .cedarscript_editor_kotlin import KotlinCEDARScriptEditor -from .cedarscript_editor_python import PythonCEDARScriptEditor +from cedarscript_editor.cedarscript_editor import CEDARScriptEditor -__version__ = "0.1.10" +__version__ = "0.2.0" + +__all__ = ["CEDARScriptEditor"] -__all__ = ["PythonCEDARScriptEditor"] diff --git a/src/cedarscript_editor/cedarscript_editor.py b/src/cedarscript_editor/cedarscript_editor.py new file mode 100644 index 0000000..39eeebe --- /dev/null +++ b/src/cedarscript_editor/cedarscript_editor.py @@ -0,0 +1,349 @@ +import os +from collections.abc import Sequence +from typing import Callable + +from cedarscript_ast_parser import Command, RmFileCommand, MvFileCommand, UpdateCommand, \ + SelectCommand, IdentifierFromFile, Segment, Marker, MoveClause, DeleteClause, \ + InsertClause, ReplaceClause, EditingAction, BodyOrWhole, RegionClause, MarkerType +from cedarscript_ast_parser.cedarscript_ast_parser import MarkerCompatible, RelativeMarker, \ + RelativePositionType +from text_manipulation.indentation_kit import IndentationInfo +from text_manipulation.range_spec import IdentifierBoundaries, RangeSpec +from text_manipulation.text_editor_kit import read_file, write_file, bow_to_search_range + +from .identifier_selector import select_finder + + +class CEDARScriptEditorException(Exception): + def __init__(self, command_ordinal: int, description: str): + match command_ordinal: + case 0 | 1: + items = '' + case 2: + items = "#1" + case 3: + items = "#1 and #2" + case _: + sequence = ", ".join(f'#{i}' for i in range(1, command_ordinal - 1)) + items = f"{sequence} and #{command_ordinal - 1}" + if command_ordinal <= 1: + note = '' + previous_cmd_notes = '' + else: + + previous_cmd_notes = ( + f", bearing in mind the file was updated and now contains all changes expressed in " + f"commands {items}" + ) + if 'syntax' in description.casefold(): + probability_indicator = "most probably" + else: + probability_indicator= "might have" + + note = ( + f"*ALL* commands *before* command #{command_ordinal} were applied and *their changes are already committed*. " + f"Re-read the file to catch up with the applied changes." + f"ATTENTION: The previous command (#{command_ordinal - 1}) {probability_indicator} caused command #{command_ordinal} to fail " + f"due to changes that left the file in an invalid state (check that by re-analyzing the file!)" + ) + super().__init__( + f"COMMAND #{command_ordinal}{note}" + f"{description}" + "NEVER apologize; just relax, take a deep breath, think step-by-step and write an in-depth analysis of what went wrong " + "(specifying which command ordinal failed), then acknowledge which commands were already applied and concisely describe the state at which the file was left " + "(saying what needs to be done now), " + f"then write new commands that will fix the problem{previous_cmd_notes} " + "(you'll get a one-million dollar tip if you get it right!) " + "Use descriptive comment before each command." + ) + + +class CEDARScriptEditor: + def __init__(self, root_path): + self.root_path = os.path.abspath(root_path) + print(f'[{self.__class__}] root: {self.root_path}') + + # TODO Add 'target_search_range: RangeSpec' parameter + def find_identifier(self, source_info: tuple[str, str | Sequence[str]], marker: Marker) -> IdentifierBoundaries: + file_path = source_info[0] + source = source_info[1] + if not isinstance(source, str): + source = '\n'.join(source) + return ( + select_finder(self.root_path, file_path, source) + (self.root_path, file_path, source, marker) + ) + + def apply_commands(self, commands: Sequence[Command]): + result = [] + for i, command in enumerate(commands): + try: + match command: + case UpdateCommand() as cmd: + result.append(self._update_command(cmd)) + # case CreateCommand() as cmd: + # result.append(self._create_command(cmd)) + case RmFileCommand() as cmd: + result.append(self._rm_command(cmd)) + case MvFileCommand() as cmd: + raise ValueError('Noy implemented: MV') + case SelectCommand() as cmd: + raise ValueError('Noy implemented: SELECT') + case _ as invalid: + raise ValueError(f"Unknown command '{type(invalid)}'") + except Exception as e: + print(f'[apply_commands] (command #{i+1}) Failed: {command}') + if isinstance(command, UpdateCommand): + print(f'CMD CONTENT: ***{command.content}***') + raise CEDARScriptEditorException(i + 1, str(e)) from e + return result + + def _update_command(self, cmd: UpdateCommand): + action: EditingAction = cmd.action + target = cmd.target + content = cmd.content or [] + file_path = os.path.join(self.root_path, target.file_path) + + # Example 1: + # UPDATE FILE "tmp.benchmarks/2024-10-04-22-59-58--CEDARScript-Gemini-small/bowling/bowling.py" + # INSERT INSIDE FUNCTION "__init__" TOP + # WITH CONTENT ''' + # @0:print("This line will be inserted at the top") + # '''; + # After parsing -> + # UpdateCommand( + # type='update', + # target=SingleFileClause(file_path='tmp.benchmarks/2024-10-04-22-59-58--CEDARScript-Gemini-small/bowling/bowling.py'), + # action=InsertClause(insert_position=RelativeMarker(type=, value='__init__', offset=None)), + # content='\n @0:print("This line will be inserted at the top")\n ' + # ) + + + # Example 2: + # UPDATE FUNCTION + # FROM FILE "tmp.benchmarks/2024-10-04-22-59-58--CEDARScript-Gemini-small/bowling/bowling.py" + # WHERE NAME = "__init__" + # REPLACE SEGMENT + # STARTING AFTER LINE "def __init__(self):" + # ENDING AFTER LINE "def __init__(self):" + # WITH CONTENT ''' + # @0:print("This line will be inserted at the top") + # '''; + # After parsing -> + # UpdateCommand( + # type='update', + # target=IdentifierFromFile(file_path='bowling.py', + # where_clause=WhereClause(field='NAME', operator='=', value='__init__'), + # identifier_type='FUNCTION', offset=None + # ), + # action=ReplaceClause( + # region=Segment( + # start=RelativeMarker(type=, value='def __init__(self):', offset=None), + # end=RelativeMarker(type=, value='def __init__(self):', offset=None) + # )), + # content='\n @0:print("This line will be inserted at the top")\n ' + # ) + + src = read_file(file_path) + lines = src.splitlines() + + source_info: tuple[str, str | Sequence[str]] = (file_path, src) + + def identifier_resolver(m: Marker): + return self.find_identifier(source_info, m) + + match action: + case MoveClause(): + # (Check parse_update_command) + # when action=MoveClause example (MOVE roll TO AFTER score): + # action.deleteclause.region=WHOLE + # action.as_marker = action.insertclause.as_marker + # action.insertclause.insert_position=FUNCTION(score) + # target.as_marker = FUNCTION(roll) (the one to delete) + search_range = RangeSpec.EMPTY + move_src_range = restrict_search_range(action, target, identifier_resolver) + case _: + move_src_range = None + # Set range_spec to cover the identifier + search_range = restrict_search_range(action, target, identifier_resolver) + + marker, search_range = find_marker_or_segment(action, lines, search_range) + + search_range = restrict_search_range_for_marker( + marker, action, lines, search_range, identifier_resolver + ) + + match content: + case str() | [str(), *_] | (str(), *_): + pass + case (region, relindent): + dest_indent = search_range.indent + content_range = restrict_search_range_for_marker( + region, action, lines, RangeSpec.EMPTY, identifier_resolver + ) + content = content_range.read(lines) + count = dest_indent + (relindent or 0) + content = IndentationInfo.from_content(content).shift_indentation( + content, count + ) + content = (region, content) + case _: + match action: + case MoveClause(insert_position=region, relative_indentation=relindent): + dest_range = restrict_search_range_for_marker( + region, action, lines, RangeSpec.EMPTY, identifier_resolver + ) + dest_indent = dest_range.indent + content = move_src_range.read(lines) + count = dest_indent + (relindent or 0) + content = IndentationInfo.from_content(content).shift_indentation( + content, count + ) + case _: + raise ValueError(f'Invalid content: {content}') + + self._apply_action(action, lines, search_range, content) + + write_file(file_path, lines) + + return f"Updated {target if target else 'file'} in {file_path}\n -> {action}" + + def _apply_action(self, action: EditingAction, lines: Sequence[str], range_spec: RangeSpec, content: str | None = None): + match action: + + case MoveClause(insert_position=insert_position, to_other_file=other_file, relative_indentation=relindent): + # TODO Move from 'lines' to the same file or to 'other_file' + range_spec.write(content, lines) + + case DeleteClause(): + range_spec.delete(lines) + + case ReplaceClause() | InsertClause(): + match content: + case (region, processed_content): + content = processed_content + case str(): + content = IndentationInfo.from_content(lines).apply_relative_indents( + content, range_spec.indent + ) + + range_spec.write(content, lines) + + case _ as invalid: + raise ValueError(f"Unsupported action type: {type(invalid)}") + + def _rm_command(self, cmd: RmFileCommand): + file_path = os.path.join(self.root_path, cmd.file_path) + + def _delete_function(self, cmd): # TODO + file_path = os.path.join(self.root_path, cmd.file_path) + + # def _create_command(self, cmd: CreateCommand): + # file_path = os.path.join(self.root_path, cmd.file_path) + # + # os.makedirs(os.path.dirname(file_path), exist_ok=False) + # with open(file_path, 'w') as file: + # file.write(content) + # + # return f"Created file: {command['file']}" + + def find_index_range_for_region(self, + region: BodyOrWhole | Marker | Segment | RelativeMarker, + lines: Sequence[str], + identifier_resolver: Callable[[Marker], IdentifierBoundaries], + search_range: RangeSpec | IdentifierBoundaries | None = None, + ) -> RangeSpec: + # BodyOrWhole | RelativeMarker | MarkerOrSegment + # marker_or_segment_to_index_range_impl + # IdentifierBoundaries.location_to_search_range(self, location: BodyOrWhole | RelativePositionType) -> RangeSpec + match region: + case BodyOrWhole() as bow: + # TODO Set indent char count + index_range = bow_to_search_range(bow, search_range) + case Marker() | Segment() as mos: + if isinstance(search_range, IdentifierBoundaries): + search_range = search_range.whole + match mos: + case Marker(type=marker_type): + match marker_type: + case MarkerType.LINE: + pass + case _: + # TODO transform to RangeSpec + mos = self.find_identifier(("find_index_range_for_region", lines), mos).body + index_range = mos.to_search_range( + lines, + search_range.start if search_range else 0, + search_range.end if search_range else -1, + ) + case _ as invalid: + raise ValueError(f"Invalid: {invalid}") + return index_range + + +def find_marker_or_segment(action: EditingAction, lines: Sequence[str], search_range: RangeSpec) -> tuple[Marker, RangeSpec]: + marker: Marker | Segment | None = None + match action: + case MarkerCompatible() as marker_compatible: + marker = marker_compatible.as_marker + case RegionClause(region=region): + match region: + case MarkerCompatible(): + marker = region.as_marker + case Segment() as segment: + # TODO Handle segment's start and end as a marker and support identifier markers + search_range = segment.to_search_range(lines, search_range) + marker = None + return marker, search_range + + +def restrict_search_range(action, target, identifier_resolver: Callable[[Marker], IdentifierBoundaries]) -> RangeSpec: + search_range = RangeSpec.EMPTY + match target: + case IdentifierFromFile() as identifier_from_file: + identifier_marker = identifier_from_file.as_marker + identifier_boundaries = identifier_resolver(identifier_marker) + if not identifier_boundaries: + raise ValueError(f"'{identifier_marker}' not found") + match action: + case RegionClause(region=region): + match region: # BodyOrWhole | Marker | Segment + case BodyOrWhole(): + search_range = identifier_boundaries.location_to_search_range(region) + case _: + search_range = identifier_boundaries.location_to_search_range(BodyOrWhole.WHOLE) + return search_range + + +def restrict_search_range_for_marker( + marker: Marker, + action: EditingAction, + lines: Sequence[str], + search_range: RangeSpec, + identifier_resolver: Callable[[Marker], IdentifierBoundaries] +) -> RangeSpec: + if marker is None: + return search_range + + match marker: + case Marker(): + match marker.type: + case MarkerType.LINE: + search_range = marker.to_search_range(lines, search_range) + match action: + case InsertClause(): + if action.insert_position.qualifier == RelativePositionType.BEFORE: + search_range = search_range.inc() + case DeleteClause(): + search_range = search_range.set_length(1) + case _: + identifier_boundaries = identifier_resolver(marker) + if not identifier_boundaries: + raise ValueError(f"'{marker}' not found") + qualifier: RelativePositionType = marker.qualifier if isinstance( + marker, RelativeMarker + ) else RelativePositionType.AT + search_range = identifier_boundaries.location_to_search_range(qualifier) + case Segment(): + pass # TODO + return search_range diff --git a/src/cedarscript_editor/cedarscript_editor_base.py b/src/cedarscript_editor/cedarscript_editor_base.py deleted file mode 100644 index 8e3321c..0000000 --- a/src/cedarscript_editor/cedarscript_editor_base.py +++ /dev/null @@ -1,200 +0,0 @@ -import os -from abc import ABC, abstractmethod - -from cedarscript_ast_parser import Command, CreateCommand, RmFileCommand, MvFileCommand, UpdateCommand, \ - SelectCommand, IdentifierFromFile, SingleFileClause, Segment, Marker, MoveClause, DeleteClause, \ - InsertClause, ReplaceClause, EditingAction, Region, BodyOrWhole, WhereClause, RegionClause -from .text_editor_kit import \ - normalize_indent, write_file, read_file, bow_to_search_range, \ - FunctionBoundaries, SearchRange, analyze_indentation, IndentationInfo - -class CEDARScriptEditorException(Exception): - def __init__(self, command_ordinal: int, description: str): - match command_ordinal: - case 0 | 1: - items = '' - case 2: - items = "#1" - case 3: - items = "#1 and #2" - case _: - sequence = ", ".join(f'#{i}' for i in range(1, command_ordinal - 1)) - items = f"{sequence} and #{command_ordinal - 1}" - if command_ordinal <= 1: - note = '' - plural_indicator='' - previous_cmd_notes = '' - else: - - plural_indicator='s' - previous_cmd_notes = f", bearing in mind the file was updated and now contains all changes expressed in command{plural_indicator} {items}" - if 'syntax' in description.casefold(): - probability_indicator = "most probably" - else: - probability_indicator= "might have" - - note = ( - f"*ALL* commands *before* command #{command_ordinal} were applied and *their changes are already committed*. " - f"Re-read the file to catch up with the applied changes." - f"ATTENTION: The previous command (#{command_ordinal - 1}) {probability_indicator} caused command #{command_ordinal} to fail " - f"due to changes that left the file in an invalid state (check that by re-analyzing the file!)" - ) - super().__init__( - f"COMMAND #{command_ordinal}{note}" - f"{description}" - "NEVER apologize; just relax, take a deep breath, think step-by-step and write an in-depth analysis of what went wrong " - "(specifying which command ordinal failed), then acknowledge which commands were already applied and concisely describe the state at which the file was left " - "(saying what needs to be done now), " - f"then write new commands that will fix the problem{previous_cmd_notes} " - "(you'll get a one-million dollar tip if you get it right!) " - "Use descriptive comment before each command." - ) - - -class CEDARScriptEditorBase(ABC): - def __init__(self, root_path): - self.root_path = os.path.abspath(root_path) - print(f'[{self.__class__}] root: {self.root_path}') - - # TODO Add search_range: SearchRange parameter - def find_function(self, source: str | list[str], file_name: str, function_name: str, offset: int | None = None) -> FunctionBoundaries: - if not isinstance(source, str): - source = '\n'.join(source) - return self._find_function(source, file_name, function_name, offset) - - @abstractmethod - def _find_function(self, source: str, file_name: str, function_name: str, offset: int | None = None) -> FunctionBoundaries | None: - pass - - def apply_commands(self, commands: list[Command]): - result = [] - for i, command in enumerate(commands): - try: - match command: - case UpdateCommand() as cmd: - result.append(self._update_command(cmd)) - case CreateCommand() as cmd: - result.append(self._create_command(cmd)) - case RmFileCommand() as cmd: - result.append(self._rm_command(cmd)) - case MvFileCommand() as cmd: - raise ValueError('Noy implemented: MV') - case SelectCommand() as cmd: - raise ValueError('Noy implemented: SELECT') - case _ as invalid: - raise ValueError(f"Unknown command '{type(invalid)}'") - except Exception as e: - print(f'[apply_commands] (command #{i+1}) Failed: {command}') - if isinstance(command, UpdateCommand): - print(f'CMD CONTENT: ***{command.content}***') - raise CEDARScriptEditorException(i + 1, str(e)) from e - return result - - def _update_command(self, cmd: UpdateCommand): - file_path = os.path.join(self.root_path, cmd.target.file_path) - content = cmd.content or [] - - match cmd.target: - - case IdentifierFromFile( - identifier_type='FUNCTION', where_clause=WhereClause(field='NAME', operator='=', value=function_name) - ): - try: - return self._update_content(file_path, cmd.action, content, function_name=function_name, offset = cmd.target.offset) - except IOError as e: - msg = f"function `{function_name}` in `{cmd.target.file_path}`" - raise IOError(f"Error updating {msg}: {e}") - - case SingleFileClause(): - try: - return self._update_content(file_path, cmd.action, content) - except IOError as e: - msg = f"file `{cmd.target.file_path}`" - raise IOError(f"Error updating {msg}: {e}") - - case _ as invalid: - raise ValueError(f"Not implemented: {invalid}") - - def _update_content(self, file_path: str, action: EditingAction, content: str | None, - search_range: SearchRange | None = None, function_name: str | None = None, offset: int | None = None) -> str: - src = read_file(file_path) - lines = src.splitlines() - - if function_name: - function_boundaries = self.find_function(src, file_path, function_name, offset) - if not function_boundaries: - raise ValueError(f"Function '{function_name}' not found in {file_path}") - if search_range: - print(f'Discarding search range to use function range...') - search_range = _get_index_range(action, lines, function_boundaries) - else: - search_range = _get_index_range(action, lines) - - self._apply_action(action, lines, search_range, content) - - write_file(file_path, lines) - - return f"Updated {'function ' + function_name if function_name else 'file'} in {file_path}\n -> {action}" - - def _apply_action(self, action: EditingAction, lines: list[str], search_range: SearchRange, content: str | None = None): - index_start, index_end, reference_indent = search_range - - match action: - - case MoveClause(insert_position=insert_position, to_other_file=other_file, relative_indentation=relindent): - saved_content = lines[index_start:index_end] - lines[index_start:index_end] = [] - # TODO Move from 'lines' to the same file or to 'other_file' - dest_range = _get_index_range(InsertClause(insert_position), lines) - indentation_info: IndentationInfo = analyze_indentation(saved_content) - lines[dest_range.start:dest_range.end] = indentation_info.adjust_indentation(saved_content, dest_range.indent + (relindent or 0)) - - case DeleteClause(): - lines[index_start:index_end] = [] - - case ReplaceClause() | InsertClause(): - indentation_info: IndentationInfo = analyze_indentation(lines) - lines[index_start:index_end] = normalize_indent(content, reference_indent, indentation_info) - - case _ as invalid: - raise ValueError(f"Unsupported action type: {type(invalid)}") - - def _rm_command(self, cmd: RmFileCommand): - file_path = os.path.join(self.root_path, cmd.file_path) - - def _delete_function(self, cmd): # TODO - file_path = os.path.join(self.root_path, cmd.file_path) - - # def _create_command(self, cmd: CreateCommand): - # file_path = os.path.join(self.root_path, cmd.file_path) - # - # os.makedirs(os.path.dirname(file_path), exist_ok=False) - # with open(file_path, 'w') as file: - # file.write(content) - # - # return f"Created file: {command['file']}" - - -def _get_index_range(action: EditingAction, lines: list[str], search_range: SearchRange | FunctionBoundaries | None = None) -> SearchRange: - match action: - case RegionClause(region=r) | InsertClause(insert_position=r): - return find_index_range_for_region(r, lines, search_range) - case _ as invalid: - raise ValueError(f"Unsupported action type: {type(invalid)}") - -def find_index_range_for_region(region: Region, lines: list[str], search_range: SearchRange | FunctionBoundaries | None = None) -> SearchRange: - match region: - case BodyOrWhole() as bow: - # TODO Set indent char count - index_range = bow_to_search_range(bow, search_range) - case Marker() | Segment() as mos: - if isinstance(search_range, FunctionBoundaries): - search_range = search_range.whole - index_range = mos.marker_or_segment_to_index_range( - lines, - search_range.start if search_range else 0, - search_range.end if search_range else -1, - ) - case _ as invalid: - raise ValueError(f"Invalid: {invalid}") - return index_range diff --git a/src/cedarscript_editor/cedarscript_editor_java.py b/src/cedarscript_editor/cedarscript_editor_java.py deleted file mode 100644 index fe2da8e..0000000 --- a/src/cedarscript_editor/cedarscript_editor_java.py +++ /dev/null @@ -1,56 +0,0 @@ -import re -import os -from .cedarscript_editor_base import CEDARScriptEditorBase - -class JavaCEDARScriptEditor(CEDARScriptEditorBase): - def _find_function(self, lines, function_name): - # Java method pattern: [modifiers] [return type] methodName( - pattern = re.compile(rf'^\s*(public|protected|private|static|\s) +[\w<>\[\]]+\s+{re.escape(function_name)}\s*\(') - for i, line in enumerate(lines): - if pattern.search(line): - return i - return None - - def _find_function_end(self, lines, start_index): - brace_count = 0 - in_string = False - string_delimiter = None - for i in range(start_index, len(lines)): - for char in lines[i]: - if char in ['"', "'"]: - if not in_string: - in_string = True - string_delimiter = char - elif string_delimiter == char: - in_string = False - string_delimiter = None - elif not in_string: - if char == '{': - brace_count += 1 - elif char == '}': - brace_count -= 1 - if brace_count == 0: - return i + 1 - return len(lines) - - def _create_command(self, command): - file_path = os.path.join(self.root_path, command['file_path']) - insert_position = command['insert_position'] - content = command['content'] - - with open(file_path, 'r') as file: - lines = file.readlines() - - marker = insert_position.split('"')[1] - for i, line in enumerate(lines): - if marker in line: - # In Java, we typically want to insert methods inside a class - class_indent = len(line) - len(line.lstrip()) - indented_content = '\n'.join(' ' * (class_indent + 4) + l for l in content.split('\n')) - lines.insert(i + 1, indented_content + '\n\n') - break - - with open(file_path, 'w') as file: - file.writelines(lines) - - return f"Created method in {command['file_path']}" diff --git a/src/cedarscript_editor/cedarscript_editor_kotlin.py b/src/cedarscript_editor/cedarscript_editor_kotlin.py deleted file mode 100644 index 2b0a48a..0000000 --- a/src/cedarscript_editor/cedarscript_editor_kotlin.py +++ /dev/null @@ -1,32 +0,0 @@ -import re -from .cedarscript_editor_base import CEDARScriptEditorBase - -class KotlinCEDARScriptEditor(CEDARScriptEditorBase): - def _find_function(self, lines, function_name): - pattern = re.compile(rf'^\s*fun\s+{re.escape(function_name)}\s*[\(<]') - for i, line in enumerate(lines): - if pattern.match(line): - return i - return None - - def _find_function_end(self, lines, start_index): - brace_count = 0 - in_string = False - string_delimiter = None - for i in range(start_index, len(lines)): - for char in lines[i]: - if char in ['"', "'"]: - if not in_string: - in_string = True - string_delimiter = char - elif string_delimiter == char: - in_string = False - string_delimiter = None - elif not in_string: - if char == '{': - brace_count += 1 - elif char == '}': - brace_count -= 1 - if brace_count == 0: - return i + 1 - return len(lines) diff --git a/src/cedarscript_editor/cedarscript_editor_python.py b/src/cedarscript_editor/cedarscript_editor_python.py deleted file mode 100644 index d9bcb9e..0000000 --- a/src/cedarscript_editor/cedarscript_editor_python.py +++ /dev/null @@ -1,68 +0,0 @@ -from .cedarscript_editor_base import CEDARScriptEditorBase, FunctionBoundaries -import rope.base.project -from rope.base import libutils, ast - -from .text_editor_kit import SearchRange, get_line_indent_count - - -def get_by_offset(obj: list, offset: int): - if 0 <= offset < len(obj): - return obj[offset] - return None - -class PythonCEDARScriptEditor(CEDARScriptEditorBase): - """ - A class to handle Python code editing operations. - """ - - # TODO Support search_start_line, search_end_line - def _find_function(self, source: str, file_name: str, function_name: str, offset: int | None = None) -> FunctionBoundaries | None: - """ - Find the starting line index of a specified function in the given lines. - - :param source: Source code. - :param function_name: Name of the function to find. - :param offset: how many functions to skip. TODO: If `None` when there are 2 or more functions with the same name, raise exception. - :return: FunctionBoundaries with function start, body start, and end lines of the function or None if not found. - """ - project = rope.base.project.Project(self.root_path) - resource = libutils.path_to_resource(project, file_name) - pymodule = libutils.get_string_module(project, source, resource=resource) - - candidates: list[FunctionBoundaries] = [] - lines = source.splitlines() - # Use rope's AST to find the function - for node in ast.walk(pymodule.get_ast()): - if not isinstance(node, ast.FunctionDef) or node.name != function_name: - continue - start_line = node.lineno - body_start_line = node.body[0].lineno if node.body else start_line - # Find the last line by traversing all child nodes - end_line = start_line - for child in ast.walk(node): - if hasattr(child, 'lineno'): - end_line = max(end_line, child.lineno) - # TODO Set indentation for all 3 lines - candidates.append(FunctionBoundaries( - SearchRange(start_line - 1, end_line, get_line_indent_count(lines[start_line - 1])), - SearchRange(body_start_line - 1, end_line, get_line_indent_count(lines[body_start_line - 1])) - )) - - candidate_count = len(candidates) - if not candidate_count: - return None - if candidate_count > 1 and offset is None: - raise ValueError( - f"There are {candidate_count} functions named `{function_name}` in file `{file_name}`. " - f"Use `OFFSET <0..{candidate_count - 1}>` to determine how many to skip. " - f"Example to reference the *last* `{function_name}`: `OFFSET {candidate_count - 1}`" - ) - if offset and offset >= candidate_count: - raise ValueError( - f"There are only {candidate_count} functions named `{function_name} in file `{file_name}`, " - f"but 'offset' was set to {offset} (you can only skip {candidate_count - 1} functions)" - ) - candidates.sort(key=lambda x: x.start_line) - return get_by_offset(candidates, offset or 0) - - diff --git a/src/cedarscript_editor/identifier_selector.py b/src/cedarscript_editor/identifier_selector.py new file mode 100644 index 0000000..f096a51 --- /dev/null +++ b/src/cedarscript_editor/identifier_selector.py @@ -0,0 +1,18 @@ +from typing import Callable + +from cedarscript_ast_parser import Marker + +import logging + +from cedarscript_editor.python_identifier_finder import find_python_identifier +from text_manipulation.range_spec import IdentifierBoundaries + +_log = logging.getLogger(__name__) + + +def select_finder( + root_path: str, file_name: str, source: str +) -> Callable[[str, str, str, Marker], IdentifierBoundaries | None]: + # TODO + _log.info("[select_finder] Python selected") + return find_python_identifier diff --git a/src/cedarscript_editor/python_identifier_finder.py b/src/cedarscript_editor/python_identifier_finder.py new file mode 100644 index 0000000..77e0c46 --- /dev/null +++ b/src/cedarscript_editor/python_identifier_finder.py @@ -0,0 +1,74 @@ +import rope +from cedarscript_ast_parser import Marker, MarkerType +from rope.base import ast, libutils +from collections.abc import Sequence + +from text_manipulation.range_spec import IdentifierBoundaries, RangeSpec +from text_manipulation.indentation_kit import get_line_indent_count + + +def get_by_offset(obj: Sequence, offset: int): + if 0 <= offset < len(obj): + return obj[offset] + return None + + +def find_python_identifier(root_path: str, file_name: str, source: str, marker: Marker) -> IdentifierBoundaries | None: + """ + Find the starting line index of a specified function in the given lines. + + :param root_path: + :param file_name: + :param source: Source code. + :param marker: Type, name and offset of the identifier to find. + TODO: If `None` when there are 2 or more identifiers with the same name, raise exception. + :return: IdentifierBoundaries with identifier start, body start, and end lines of the identifier + or None if not found. + """ + project = rope.base.project.Project(root_path) + resource = libutils.path_to_resource(project, file_name) + pymodule = libutils.get_string_module(project, source, resource=resource) + + candidates: list[IdentifierBoundaries] = [] + lines = source.splitlines() + # Use rope's AST to find the identifier + match marker.type: + case MarkerType.FUNCTION: + ast_type = ast.FunctionDef + case MarkerType.CLASS: + ast_type = ast.ClassDef + case _: + raise ValueError(f'Invalid identifier type: {marker.type}') + for node in ast.walk(pymodule.get_ast()): + if not isinstance(node, ast_type) or node.name != marker.value: + continue + start_line = node.lineno + body_start_line = node.body[0].lineno if node.body else start_line + # Find the last line by traversing all child nodes + end_line = start_line + for child in ast.walk(node): + if hasattr(child, 'lineno'): + end_line = max(end_line, child.lineno) + # TODO Set indentation for all 3 lines + candidates.append(IdentifierBoundaries( + RangeSpec(start_line - 1, end_line, get_line_indent_count(lines[start_line - 1])), + RangeSpec(body_start_line - 1, end_line, get_line_indent_count(lines[body_start_line - 1])) + )) + + candidate_count = len(candidates) + if not candidate_count: + return None + if candidate_count > 1 and marker.offset is None: + raise ValueError( + f"There are {candidate_count} functions named `{marker.value}` in file `{file_name}`. " + f"Use `OFFSET <0..{candidate_count - 1}>` to determine how many to skip. " + f"Example to reference the *last* `{marker.value}`: `OFFSET {candidate_count - 1}`" + ) + if marker.offset and marker.offset >= candidate_count: + raise ValueError( + f"There are only {candidate_count} functions named `{marker.value} in file `{file_name}`, " + f"but 'offset' was set to {marker.offset} (you can only skip {candidate_count - 1} functions)" + ) + candidates.sort(key=lambda x: x.start_line) + result: IdentifierBoundaries = get_by_offset(candidates, marker.offset or 0) + return result diff --git a/src/cedarscript_editor/text_editor_kit.py b/src/cedarscript_editor/text_editor_kit.py deleted file mode 100644 index f586ee6..0000000 --- a/src/cedarscript_editor/text_editor_kit.py +++ /dev/null @@ -1,348 +0,0 @@ -import re -from collections import Counter -from typing import NamedTuple, Protocol, runtime_checkable -from math import gcd - -from cedarscript_ast_parser import Marker, RelativeMarker, RelativePositionType, Segment, MarkerType, BodyOrWhole - -MATCH_TYPES = ('exact', 'stripped', 'normalized', 'partial') - -class MarkerMatchResult(NamedTuple): - match_type: str - index: int - indent: int - - def __str__(self): - return f"{self.match_type.lower()} @ {self.index} ({self.indent})" - - -class IndexBoundaries(NamedTuple): - start: MarkerMatchResult - end: MarkerMatchResult - - -class SearchRange(NamedTuple): - start: int - end: int - indent: int = 0 - - -class FunctionBoundaries(NamedTuple): - whole: SearchRange - body: SearchRange - # TODO Derive these 3 attrs from search ranges below - - @property - def start_line(self) -> int: - return self.whole.start + 1 - - @property - def body_start_line(self) -> int: - return self.body.start + 1 - - @property - def end_line(self) -> int: - return self.whole.end - - -def read_file(file_path: str) -> str: - with open(file_path, 'r') as file: - return file.read() - - -def write_file(file_path: str, lines: list[str]): - with open(file_path, 'w') as file: - file.writelines([line + '\n' for line in lines]) - -class IndentationInfo(NamedTuple): - char_count: int - char: str - min_indent_level: int - consistency: bool = True - message: str | None = None - - def level_difference(self, base_indentation_count: int): - return self.char_count_to_level(base_indentation_count) - self.min_indent_level - - def char_count_to_level(self, char_count: int) -> int: - return char_count // self.char_count - - def level_to_chars(self, level: int) -> str: - return level * self.char_count * self.char - - def adjust_indentation(self, lines: list[str], base_indentation_count: int) -> list[str]: - line_adjuster = self._adjust_indentation_fun(base_indentation_count) - # Return the transformed lines - return [line_adjuster(line) for line in lines] - - def _adjust_indentation_fun(self, base_indentation_count: int): - # Calculate the indentation difference - level_difference = self.level_difference(base_indentation_count) - - def adjust_line(line: str) -> str: - if not line.strip(): - # Handle empty lines or lines with only whitespace - return line - - current_indent = get_line_indent_count(line) - current_level = self.char_count_to_level(current_indent) - new_level = max(0, current_level + level_difference) - new_indent = self.level_to_chars(new_level) - - return new_indent + line.lstrip() - return adjust_line - -def get_line_indent_count(line: str): - return len(line) - len(line.lstrip()) - -def count_leading_chars(line: str, char: str) -> int: - return len(line) - len(line.lstrip(char)) - - -def normalize_line(line: str): - return re.sub(r'[^\w]', '.', line.strip(), flags=re.UNICODE) - - -def bow_to_search_range(bow: BodyOrWhole, searh_range: FunctionBoundaries | SearchRange | None = None, lines: list[str] | None = None) -> SearchRange: - match searh_range: - - case SearchRange() | None: - return searh_range or SearchRange(0, -1, 0) - - case FunctionBoundaries() as function_boundaries: - match bow: - case BodyOrWhole.BODY: - return function_boundaries.body - case BodyOrWhole.WHOLE: - return function_boundaries.whole - case _ as invalid: - raise ValueError(f"Invalid: {invalid}") - - case _ as invalid: - raise ValueError(f"Invalid: {invalid}") - - -# MarkerOrSegment - -# class MarkerOrSegmentProtocol(Protocol): -# def marker_or_segment_to_index_range(self) -> str: -# ... - - -@runtime_checkable -class MarkerOrSegmentProtocol(Protocol): - def marker_or_segment_to_index_range( - self, - lines: list[str], - search_start_index: int = 0, search_end_index: int = -1 - ) -> SearchRange: - ... - - -def marker_or_segment_to_index_range_impl( - self, - lines: list[str], - search_start_index: int = 0, search_end_index: int = -1 -) -> SearchRange | None: - match self: - case Marker(type=MarkerType.LINE): - result = find_line_index_and_indent(lines, self, search_start_index, search_end_index) - assert result, f"Unable to find `{self}`; Try: 1) Double-checking the marker (maybe you specified the the wrong one); or 2) using *exactly* the same characters from source; or 3) using another marker" - return SearchRange(result.index, result.index + 1, result.indent) - case Segment(start=s, end=e): - result = segment_to_indexes(lines, s, e, search_start_index, search_end_index) - return SearchRange(result.start.index, result.end.index, result.start.indent) - case _ as invalid: - raise ValueError(f"Unexpected type: {invalid}") - - -Marker.marker_or_segment_to_index_range = marker_or_segment_to_index_range_impl -Segment.marker_or_segment_to_index_range = marker_or_segment_to_index_range_impl - - -def find_line_index_and_indent( - lines: list[str], - search_term: Marker | RelativeMarker, - search_start_index: int = 0, search_end_index: int = -1 -) -> MarkerMatchResult | None: - """ - Find the index of a specified line within a list of strings, considering different match types and an offset. - - This function searches for a given line within a list, considering 4 types of matches in order of priority: - 1. Exact match - 2. Stripped match (ignoring leading and trailing whitespace) - 3. Normalized match (ignoring non-alphanumeric characters) - 4. Partial (Searching for a substring, using `casefold` to ignore upper- and lower-case differences. - - The function applies the offset across all match types while maintaining the priority order. - - :Args: - :param lines: The list of strings to search through. - :param search_term: - search_marker.value: The line to search for. - search_marker.offset: The number of matches to skip before returning a result. - 0 skips no match and returns the first match, 1 returns the second match, and so on. - :param search_start_index: The index to start the search from. Defaults to 0. - :param search_end_index: The index to end the search at (exclusive). - Defaults to -1, which means search to the end of the list. - - :returns: - MarkerMatchResult: The index for the desired line in the 'lines' list. - Returns None if no match is found or if the offset exceeds the number of matches within each category. - - :Example: - >> lines = ["Hello, world!", " Hello, world! ", "Héllo, wörld?", "Another line", "Hello, world!"] - >> _find_line_index(lines, "Hello, world!", 1) - 4 # Returns the index of the second exact match - - Note: - - The function prioritizes match types in the order: exact, stripped, normalized, partial. - - The offset is considered separately for each type. - """ - search_line = search_term.value - assert search_line, "Empty marker" - assert search_term.type == MarkerType.LINE, f"Invalid marker type: {search_term.type}" - - matches = {t: [] for t in MATCH_TYPES} - - stripped_search = search_line.strip() - normalized_search_line = normalize_line(stripped_search) - - if search_start_index < 0: - search_start_index = 0 - if search_end_index < 0: - search_end_index = len(lines) - - assert search_start_index < len(lines), f"search start index ({search_start_index}) must be less than line count ({len(lines)})" - assert search_end_index <= len(lines), f"search end index ({search_end_index}) must be less than or equal to line count ({len(lines)})" - - for i in range(search_start_index, search_end_index): - line = lines[i] - reference_indent = get_line_indent_count(line) - - # Check for exact match - if search_line == line: - matches['exact'].append((i, reference_indent)) - - # Check for stripped match - elif stripped_search == line.strip(): - matches['stripped'].append((i, reference_indent)) - - # Check for normalized match - elif normalized_search_line == normalize_line(line): - matches['normalized'].append((i, reference_indent)) - - # Last resort! - elif normalized_search_line.casefold() in normalize_line(line).casefold(): - matches['partial'].append((i, reference_indent)) - - offset = search_term.offset or 0 - for match_type in MATCH_TYPES: - if offset < len(matches[match_type]): - index, reference_indent = matches[match_type][offset] - match match_type: - case 'normalized': - print(f'Note: using {match_type} match for {search_term}') - case 'partial': - print(f"Note: Won't accept {match_type} match at index {index} for {search_term}") - continue - if isinstance(search_term, RelativeMarker): - match search_term.qualifier: - case RelativePositionType.BEFORE: - index += -1 - case RelativePositionType.AFTER: - index += 1 - case RelativePositionType.AT: - pass - case _ as invalid: - raise ValueError(f"Not implemented: {invalid}") - return MarkerMatchResult(match_type, index, reference_indent) - - return None - - -def segment_to_indexes( - lines: list[str], - start_relpos: RelativeMarker, end_relpos: RelativeMarker, - search_start_index: int = 0, search_end_index: int = -1 -) -> IndexBoundaries: - assert len(lines), "`lines` is empty" - - start_match_result = find_line_index_and_indent(lines, start_relpos, search_start_index, search_end_index) - assert start_match_result, f"Unable to find segment start \"{start_relpos}\"; Try: 1) Double-checking the marker (maybe you specified the the wrong one); or 2) using *exactly* the same characters from source; or 3) using a marker from above" - - end_match_result = find_line_index_and_indent(lines, end_relpos, start_match_result.index, search_end_index) - if end_match_result: - if end_match_result.index > -1: - end_match_result = end_match_result._replace(index=end_match_result.index+1) - assert end_match_result, f"Unable to find segment end \"{end_relpos}\" - Try: 1) using *exactly* the same characters from source; or 2) using a marker from below" - return IndexBoundaries(start_match_result, end_match_result) - - -def normalize_indent(content: str, context_indent_count: int = 0, indentation_info: IndentationInfo | None = None) -> list[str]: - # TODO Always send str? - lines = [line.lstrip() for line in content.splitlines() if line.strip()] if isinstance(content, str) else content - - context_indent_level = indentation_info.char_count_to_level(context_indent_count) - for i in range(len(lines)): - line = lines[i] - parts = line.split(':', 1) - if len(parts) == 2 and parts[0].startswith('@'): - relative_indent_level = int(parts[0][1:]) - absolute_indent_level = context_indent_level + relative_indent_level - assert absolute_indent_level >= 0, f"Final indentation for line `{line.strip()}` cannot be negative ({absolute_indent_level})" - lines[i] = indentation_info.level_to_chars(absolute_indent_level) + parts[1].lstrip() - else: - absolute_indent_level = context_indent_level - lines[i] = indentation_info.level_to_chars(absolute_indent_level) + line.lstrip() - - return lines - -def analyze_indentation(lines: list[str]) -> IndentationInfo: - - def extract_indentation(line: str) -> str: - return re.match(r'^\s*', line).group(0) - - indentations = [extract_indentation(line) for line in lines if line.strip()] - - if not indentations: - return IndentationInfo(4, ' ', 0, True, "No indentation found. Assuming 4 spaces (PEP 8).") - - indent_chars = Counter(indent[0] for indent in indentations if indent) - dominant_char = ' ' if indent_chars.get(' ', 0) >= indent_chars.get('\t', 0) else '\t' - - indent_lengths = [len(indent) for indent in indentations] - - if dominant_char == '\t': - char_count = 1 - else: - # For spaces, determine the most likely char_count - space_counts = [len for len in indent_lengths if len % 2 == 0 and len > 0] - if not space_counts: - char_count = 2 # Default to 2 if no even space counts - else: - # Sort top 5 space counts and find the largest GCD - sorted_counts = sorted([c[0] for c in Counter(space_counts).most_common(5)], reverse=True) - char_count = sorted_counts[0] - for i in range(1, len(sorted_counts)): - new_gcd = gcd(char_count, sorted_counts[i]) - if new_gcd <= 1: - break - char_count = new_gcd - - min_indent_chars = min(indent_lengths) if indent_lengths else 0 - min_indent_level = min_indent_chars // char_count - - consistency = all(len(indent) % char_count == 0 for indent in indentations if indent) - match dominant_char: - case ' ': - domcharstr = 'space' - case '\t': - domcharstr = 'tab' - case _: - domcharstr = dominant_char - message = f"Found {char_count}-{domcharstr} indentation" - if not consistency: - message += " (inconsistent)" - - return IndentationInfo(char_count, dominant_char, min_indent_level, consistency, message) diff --git a/src/text_manipulation/__init__.py b/src/text_manipulation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/text_manipulation/indentation_kit.py b/src/text_manipulation/indentation_kit.py new file mode 100644 index 0000000..f339dd0 --- /dev/null +++ b/src/text_manipulation/indentation_kit.py @@ -0,0 +1,236 @@ +import re +from collections import Counter +from collections.abc import Sequence +from math import gcd +from typing import NamedTuple + + +def get_line_indent_count(line: str): + return len(line) - len(line.lstrip()) + + +def extract_indentation(line: str) -> str: + """ + Extract the leading whitespace from a given line. + + Args: + line (str): The input line to process. + + Returns: + str: The leading whitespace of the line. + + Examples: + >>> extract_indentation(" Hello") + ' ' + >>> extract_indentation("\t\tWorld") + '\t\t' + >>> extract_indentation("No indentation") + '' + """ + return line[:len(line) - len(line.lstrip())] + + +class IndentationInfo(NamedTuple): + """ + A class to represent and manage indentation information. + + This class analyzes and provides utilities for working with indentation. + It detects the indentation character (space or tab), + the number of characters used for each indentation level, and provides + methods to adjust and normalize indentation. + + Attributes: + char_count (int): The number of characters used for each indentation level. + char (str): The character used for indentation (' ' for space, '\t' for tab). + min_indent_level (int): The minimum indentation level found in the analyzed content. + consistency (bool): Whether the indentation is consistent throughout the content. + message (str | None): A message describing the indentation analysis results. + + Class Methods: + from_content: Analyzes the indentation in the given content and creates an IndentationInfo instance. + + Methods: + level_difference: Calculates the difference in indentation levels. + char_count_to_level: Converts a character count to an indentation level. + level_to_chars: Converts an indentation level to a string of indentation characters. + shift_indentation: Adjusts the indentation of a sequence of lines. + apply_relative_indents: Applies relative indentation based on annotations in the content. + + Note: + This class is particularly useful for processing Python code with varying + or inconsistent indentation, and for adjusting indentation to meet specific + formatting requirements. + """ + char_count: int + char: str + min_indent_level: int + consistency: bool = True + message: str | None = None + + @classmethod + def from_content[T: IndentationInfo, S: Sequence[str]](cls: T, content: str | S) -> T: + """ + Analyzes the indentation in the given content and creates an IndentationInfo instance. + + This method examines the indentation patterns in the provided content, + determines the dominant indentation character and count, and assesses + the consistency of indentation throughout the content. + + Args: + content (str | Sequence[str]): The content to analyze. Can be a string + or a sequence of strings. + + Returns: + IndentationInfo: An instance of IndentationInfo with the analysis results. + + Note: + - If no indentation is found, it assumes 4 spaces as per PEP 8. + - For space indentation, it attempts to determine the most likely + character count by analyzing patterns and using GCD. + """ + # TODO Always send str? + lines = [x.lstrip() for x in content.splitlines() if x.strip()] if isinstance(content, str) else content + + indentations = [extract_indentation(line) for line in lines if line.strip()] + + if not indentations: + return cls(4, ' ', 0, True, "No indentation found. Assuming 4 spaces (PEP 8).") + + indent_chars = Counter(indent[0] for indent in indentations if indent) + dominant_char = ' ' if indent_chars.get(' ', 0) >= indent_chars.get('\t', 0) else '\t' + + indent_lengths = [len(indent) for indent in indentations] + + if dominant_char == '\t': + char_count = 1 + else: + # For spaces, determine the most likely char_count + space_counts = [sc for sc in indent_lengths if sc % 2 == 0 and sc > 0] + if not space_counts: + char_count = 2 # Default to 2 if no even space counts + else: + # Sort top 5 space counts and find the largest GCD + sorted_counts = sorted([c[0] for c in Counter(space_counts).most_common(5)], reverse=True) + char_count = sorted_counts[0] + for i in range(1, len(sorted_counts)): + new_gcd = gcd(char_count, sorted_counts[i]) + if new_gcd <= 1: + break + char_count = new_gcd + + min_indent_chars = min(indent_lengths) if indent_lengths else 0 + min_indent_level = min_indent_chars // char_count + + consistency = all(len(indent) % char_count == 0 for indent in indentations if indent) + match dominant_char: + case ' ': + domcharstr = 'space' + case '\t': + domcharstr = 'tab' + case _: + domcharstr = dominant_char + message = f"Found {char_count}-{domcharstr} indentation" + if not consistency: + message += " (inconsistent)" + + return cls(char_count, dominant_char, min_indent_level, consistency, message) + + def level_difference(self, base_indentation_count: int): + return self.char_count_to_level(base_indentation_count) - self.min_indent_level + + def char_count_to_level(self, char_count: int) -> int: + return char_count // self.char_count + + def level_to_chars(self, level: int) -> str: + return level * self.char_count * self.char + + def shift_indentation(self, lines: Sequence[str], target_base_indentation_count: int) -> list[str]: + """ + Shifts the indentation of a sequence of lines based on a base indentation count. + + This method adjusts the indentation of each non-empty line in the input sequence. + It calculates the difference between the base indentation and the minimum + indentation found in the content, then applies this shift to all lines. + + Args: + lines (Sequence[str]): A sequence of strings representing the lines to be adjusted. + target_base_indentation_count (int): The base indentation count to adjust from. + + Returns: + list[str]: A new list of strings with adjusted indentation. + + Note: + - Empty lines and lines with only whitespace are preserved as-is. + - The method uses the IndentationInfo of the instance to determine + the indentation character and count. + - This method is useful for uniformly adjusting indentation across all lines. + """ + raw_line_adjuster = self._shift_indentation_fun(target_base_indentation_count) + # Return the transformed lines + return [raw_line_adjuster(line) for line in lines] + + def _shift_indentation_fun(self, target_base_indentation_count: int): + # Calculate the indentation difference + level_difference = self.level_difference(target_base_indentation_count) + + def adjust_line(line: str) -> str: + if not line.strip(): + # Handle empty lines or lines with only whitespace + return line + + current_indent_count = get_line_indent_count(line) + current_level = self.char_count_to_level(current_indent_count) + new_level = max(0, current_level + level_difference) + new_indent = self.level_to_chars(new_level) + + return new_indent + line.lstrip() + return adjust_line + + def apply_relative_indents[S: Sequence[str]](self, content: str | S, context_indent_count: int = 0) -> list[str]: + """ + Applies relative indentation based on annotations in the content. + + This method processes the input content, interpreting special annotations + to apply relative indentation. It uses '@' followed by a number to indicate + relative indentation levels. + + Args: + content (str | Sequence[str]): The content to process. Can be a string + or a sequence of strings. + context_indent_count (int, optional): The base indentation count of the + context. Defaults to 0. + + Returns: + list[str]: A new list of strings with normalized indentation (without the annotations) + + Note: + - Lines starting with '@n:' (where n is an integer) are interpreted as + having a relative indentation of n levels from the context indent level. + - Empty lines and lines with only whitespace are removed. + - The method uses the IndentationInfo of the instance to determine + the indentation character and count. + - This method is particularly useful for content with varying + indentation levels specified by annotations. + + Raises: + AssertionError: If the calculated indentation level for any line is negative. + """ + # TODO Always send str? + lines = [line.lstrip() for line in content.splitlines() if line.strip()] if isinstance(content, str) else content + + context_indent_level = self.char_count_to_level(context_indent_count) + for i in range(len(lines)): + line = lines[i] + parts = line.split(':', 1) + if len(parts) == 2 and parts[0].startswith('@'): + relative_indent_level = int(parts[0][1:]) + absolute_indent_level = context_indent_level + relative_indent_level + assert absolute_indent_level >= 0, f"Final indentation for line `{line.strip()}` cannot be negative ({absolute_indent_level})" + lines[i] = self.level_to_chars(absolute_indent_level) + parts[1].lstrip() + else: + absolute_indent_level = context_indent_level + lines[i] = self.level_to_chars(absolute_indent_level) + line.lstrip() + + return lines + + diff --git a/src/text_manipulation/range_spec.py b/src/text_manipulation/range_spec.py new file mode 100644 index 0000000..5d882a4 --- /dev/null +++ b/src/text_manipulation/range_spec.py @@ -0,0 +1,195 @@ +import re +from collections.abc import Sequence +from typing import NamedTuple + +from cedarscript_ast_parser import Marker, RelativeMarker, RelativePositionType, MarkerType, BodyOrWhole +from text_manipulation.indentation_kit import get_line_indent_count + +MATCH_TYPES = ('exact', 'stripped', 'normalized', 'partial') + + +class RangeSpec(NamedTuple): + start: int + end: int + indent: int = 0 + + def __str__(self): + return (f'{self.start}:{self.end}' if self.as_index is None else f'%{self.as_index}') + f'@{self.indent}' + + def __len__(self): + return self.end - self.start + + @property + def as_index(self) -> int | None: + return None if len(self) else self.start + + @property + def collapsed(self): + return self.set_length(0) + + def set_length(self, range_len: int): + return self._replace(end=self.start + range_len) + + def inc(self, count: int = 1): + return self._replace(start=self.start + count, end=self.end + count) + + def dec(self, count: int = 1): + return self._replace(start=self.start - count, end=self.end - count) + + def read[S: Sequence[str]](self, src: S) -> S: + return src[self.start:self.end] + + def write[S: Sequence[str]](self, src: S, target: S): + target[self.start:self.end] = src + + def delete[S: Sequence[str]](self, src: S) -> S: + result = self.read(src) + del src[self.start:self.end] + return result + + @staticmethod + def normalize_line(line: str): + return re.sub(r'[^\w]', '.', line.strip(), flags=re.UNICODE) + + @classmethod + def from_line_marker[T: RangeSpec]( + cls: T, + lines: Sequence[str], + search_term: Marker, + search_range: 'RangeSpec' = None + ) -> T | None: + """ + Find the index of a specified line within a list of strings, considering different match types and an offset. + + This function searches for a given line within a list, considering 4 types of matches in order of priority: + 1. Exact match + 2. Stripped match (ignoring leading and trailing whitespace) + 3. Normalized match (ignoring non-alphanumeric characters) + 4. Partial (Searching for a substring, using `casefold` to ignore upper- and lower-case differences). + + The function applies the offset across all match types while maintaining the priority order. + + :Args: + :param lines: The list of strings to search through. + :param search_term: + search_marker.value: The line to search for. + search_marker.offset: The number of matches to skip before returning a result. + 0 skips no match and returns the first match, 1 returns the second match, and so on. + :param search_range: The index to start the search from. Defaults to 0. The index to end the search at (exclusive). + Defaults to (0, -1), which means search to the end of the list. + + :returns: + RangeSpec: The index for the desired line in the 'lines' list. + Returns None if no match is found or if the offset exceeds the number of matches within each category. + + :Example: + >> lines = ["Hello, world!", " Hello, world! ", "Héllo, wörld?", "Another line", "Hello, world!"] + >> _find_line_index(lines, "Hello, world!", 1) + 4 # Returns the index of the second exact match + + Note: + - The function prioritizes match types in the order: exact, stripped, normalized, partial. + - The offset is considered separately for each type. + """ + search_start_index, search_end_index, _ = search_range if search_range is not None else (0, -1, 0) + search_line = search_term.value + assert search_line, "Empty marker" + assert search_term.type == MarkerType.LINE, f"Invalid marker type: {search_term.type}" + + matches = {t: [] for t in MATCH_TYPES} + + stripped_search = search_line.strip() + normalized_search_line = cls.normalize_line(stripped_search) + + if search_start_index < 0: + search_start_index = 0 + if search_end_index < 0: + search_end_index = len(lines) + + assert search_start_index < len(lines), f"search start index ({search_start_index}) must be less than line count ({len(lines)})" + assert search_end_index <= len(lines), f"search end index ({search_end_index}) must be less than or equal to line count ({len(lines)})" + + for i in range(search_start_index, search_end_index): + line = lines[i] + reference_indent = get_line_indent_count(line) + + # Check for exact match + if search_line == line: + matches['exact'].append((i, reference_indent)) + + # Check for stripped match + elif stripped_search == line.strip(): + matches['stripped'].append((i, reference_indent)) + + # Check for normalized match + elif normalized_search_line == cls.normalize_line(line): + matches['normalized'].append((i, reference_indent)) + + # Last resort! + elif normalized_search_line.casefold() in cls.normalize_line(line).casefold(): + matches['partial'].append((i, reference_indent)) + + offset = search_term.offset or 0 + for match_type in MATCH_TYPES: + if offset < len(matches[match_type]): + index, reference_indent = matches[match_type][offset] + match match_type: + case 'normalized': + print(f'Note: using {match_type} match for {search_term}') + case 'partial': + print(f"Note: Won't accept {match_type} match at index {index} for {search_term}") + continue + if isinstance(search_term, RelativeMarker): + match search_term.qualifier: + case RelativePositionType.BEFORE: + index += -1 + case RelativePositionType.AFTER: + index += 1 + case RelativePositionType.AT: + pass + case _ as invalid: + raise ValueError(f"Not implemented: {invalid}") + return cls(index, index, reference_indent) + + return None + + +RangeSpec.EMPTY = RangeSpec(0, -1, 0) + + +class IdentifierBoundaries(NamedTuple): + whole: RangeSpec + body: RangeSpec + + def __str__(self): + return f'IdentifierBoundaries({self.whole} (BODY: {self.body}) )' + + @property + def start_line(self) -> int: + return self.whole.start + 1 + + @property + def body_start_line(self) -> int: + return self.body.start + 1 + + @property + def end_line(self) -> int: + return self.whole.end + + # See the other bow_to_search_range + def location_to_search_range(self, location: BodyOrWhole | RelativePositionType) -> RangeSpec: + match location: + case BodyOrWhole.BODY: + return self.body + case BodyOrWhole.WHOLE | RelativePositionType.AT: + return self.whole + case RelativePositionType.BEFORE: + return RangeSpec(self.whole.start, self.whole.start, self.whole.indent) + case RelativePositionType.AFTER: + return RangeSpec(self.whole.end, self.whole.end, self.whole.indent) + case RelativePositionType.INSIDE_TOP: + return RangeSpec(self.body.start, self.body.start, self.body.indent) + case RelativePositionType.INSIDE_BOTTOM: + return RangeSpec(self.body.end, self.body.end, self.body.indent) + case _ as invalid: + raise ValueError(f"Invalid: {invalid}") diff --git a/src/text_manipulation/text_editor_kit.py b/src/text_manipulation/text_editor_kit.py new file mode 100644 index 0000000..29e43f3 --- /dev/null +++ b/src/text_manipulation/text_editor_kit.py @@ -0,0 +1,92 @@ +from collections.abc import Sequence +from typing import Protocol, runtime_checkable + +from cedarscript_ast_parser import Marker, RelativeMarker, RelativePositionType, Segment, MarkerType, BodyOrWhole +from text_manipulation.range_spec import IdentifierBoundaries, RangeSpec + + +def read_file(file_path: str) -> str: + with open(file_path, 'r') as file: + return file.read() + + +def write_file(file_path: str, lines: Sequence[str]): + with open(file_path, 'w') as file: + file.writelines([line + '\n' for line in lines]) + + +# def count_leading_chars(line: str, char: str) -> int: +# return len(line) - len(line.lstrip(char)) + +def bow_to_search_range(bow: BodyOrWhole, searh_range: IdentifierBoundaries | RangeSpec | None = None) -> RangeSpec: + match searh_range: + + case RangeSpec() | None: + return searh_range or RangeSpec.EMPTY + + case IdentifierBoundaries(): + return searh_range.location_to_search_range(bow) + + case _ as invalid: + raise ValueError(f"Invalid: {invalid}") + + +# MarkerOrSegment + +# class MarkerOrSegmentProtocol(Protocol): +# def to_search_range(self) -> str: +# ... + + +@runtime_checkable +class MarkerOrSegmentProtocol(Protocol): + def marker_or_segment_to_index_range( + self, + lines: Sequence[str], + search_start_index: int = 0, search_end_index: int = -1 + ) -> RangeSpec: + ... + + +def marker_or_segment_to_search_range_impl( + self, + lines: Sequence[str], + search_range: RangeSpec = RangeSpec.EMPTY +) -> RangeSpec | None: + match self: + case Marker(type=MarkerType.LINE): + result = RangeSpec.from_line_marker(lines, self, search_range) + assert result is not None, f"Unable to find `{self}`; Try: 1) Double-checking the marker (maybe you specified the the wrong one); or 2) using *exactly* the same characters from source; or 3) using another marker" + # TODO check under which circumstances we should return a 1-line range instead of an empty range + return result + case Segment(start=s, end=e): + return segment_to_search_range(lines, s, e, search_range) + case _ as invalid: + raise ValueError(f"Unexpected type: {invalid}") + + +Marker.to_search_range = marker_or_segment_to_search_range_impl +Segment.to_search_range = marker_or_segment_to_search_range_impl + + +def segment_to_search_range( + lines: Sequence[str], + start_relpos: RelativeMarker, end_relpos: RelativeMarker, + search_range: RangeSpec = RangeSpec.EMPTY +) -> RangeSpec: + assert len(lines), "`lines` is empty" + + start_match_result = RangeSpec.from_line_marker(lines, start_relpos, search_range) + assert start_match_result, f"Unable to find segment start `{start_relpos}`; Try: 1) Double-checking the marker (maybe you specified the the wrong one); or 2) using *exactly* the same characters from source; or 3) using a marker from above" + + start_index_for_end_marker = start_match_result.as_index + if start_relpos.qualifier == RelativePositionType.AFTER: + start_index_for_end_marker += -1 + end_match_result = RangeSpec.from_line_marker(lines, end_relpos, RangeSpec(start_index_for_end_marker, search_range.end, start_match_result.indent)) + assert end_match_result, f"Unable to find segment end `{end_relpos}` - Try: 1) using *exactly* the same characters from source; or 2) using a marker from below" + if end_match_result.as_index > -1: + one_after_end = end_match_result.as_index + 1 + end_match_result = RangeSpec(one_after_end, one_after_end, end_match_result.indent) + return RangeSpec( + start_match_result.as_index, end_match_result.as_index, start_match_result.indent + )