Flaky test_crawl_with_proxy #743

Open

janbuchar opened this issue Nov 25, 2024 · 2 comments
Labels
bug (Something isn't working.) · debt (Code quality improvement or decrease of technical debt.) · t-tooling (Issues with this label are in the ownership of the tooling team.)

Comments

@janbuchar
Collaborator

Now that httpbin fails less often, we should look into the other flaky tests... this one fails quite often:

____________________________ test_crawl_with_proxy _____________________________
[gw0] linux -- Python 3.10.15 /home/runner/.cache/pypoetry/virtualenvs/crawlee-CKpluuj2-py3.10/bin/python

self = <curl_cffi.requests.session.AsyncSession object at 0x7fb471ec7df0>
method = 'GET'
url = 'https://janbuchar--httpbin.apify.actor/status/222?token=apify_api_xERCvoSw7a3SEVVclipJJPsfhGTFwZ3MKrI1'
params = None, data = None, json = None, headers = HttpHeaders(root={})
cookies = None, files = None, auth = None
timeout = <object object at 0x7fb475314500>, allow_redirects = True
max_redirects = None, proxies = None, proxy = None, proxy_auth = None
verify = None, referer = None, accept_encoding = 'gzip, deflate, br'
content_callback = None, impersonate = None, ja3 = None, akamai = None
extra_fp = None, default_headers = None, default_encoding = 'utf-8', quote = ''
http_version = None, interface = None, cert = None, stream = False
max_recv_speed = 0, multipart = None

    async def request(
        self,
        method: HttpMethod,
        url: str,
        params: Optional[Union[Dict, List, Tuple]] = None,
        data: Optional[Union[Dict[str, str], List[Tuple], str, BytesIO, bytes]] = None,
        json: Optional[dict] = None,
        headers: Optional[HeaderTypes] = None,
        cookies: Optional[CookieTypes] = None,
        files: Optional[Dict] = None,
        auth: Optional[Tuple[str, str]] = None,
        timeout: Optional[Union[float, Tuple[float, float], object]] = not_set,
        allow_redirects: Optional[bool] = None,
        max_redirects: Optional[int] = None,
        proxies: Optional[ProxySpec] = None,
        proxy: Optional[str] = None,
        proxy_auth: Optional[Tuple[str, str]] = None,
        verify: Optional[bool] = None,
        referer: Optional[str] = None,
        accept_encoding: Optional[str] = "gzip, deflate, br",
        content_callback: Optional[Callable] = None,
        impersonate: Optional[BrowserTypeLiteral] = None,
        ja3: Optional[str] = None,
        akamai: Optional[str] = None,
        extra_fp: Optional[Union[ExtraFingerprints, ExtraFpDict]] = None,
        default_headers: Optional[bool] = None,
        default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
        quote: Union[str, Literal[False]] = "",
        http_version: Optional[CurlHttpVersion] = None,
        interface: Optional[str] = None,
        cert: Optional[Union[str, Tuple[str, str]]] = None,
        stream: bool = False,
        max_recv_speed: int = 0,
        multipart: Optional[CurlMime] = None,
    ):
        """Send the request, see ``curl_cffi.requests.request`` for details on parameters."""
        self._check_session_closed()
    
        curl = await self.pop_curl()
        req, buffer, header_buffer, q, header_recved, quit_now = self._set_curl_options(
            curl=curl,
            method=method,
            url=url,
            params=params,
            data=data,
            json=json,
            headers=headers,
            cookies=cookies,
            files=files,
            auth=auth,
            timeout=timeout,
            allow_redirects=allow_redirects,
            max_redirects=max_redirects,
            proxies=proxies,
            proxy=proxy,
            proxy_auth=proxy_auth,
            verify=verify,
            referer=referer,
            accept_encoding=accept_encoding,
            content_callback=content_callback,
            impersonate=impersonate,
            ja3=ja3,
            akamai=akamai,
            extra_fp=extra_fp,
            default_headers=default_headers,
            quote=quote,
            http_version=http_version,
            interface=interface,
            stream=stream,
            max_recv_speed=max_recv_speed,
            multipart=multipart,
            cert=cert,
            queue_class=asyncio.Queue,
            event_class=asyncio.Event,
        )
        if stream:
            task = self.acurl.add_handle(curl)
    
            async def perform():
                try:
                    await task
                except CurlError as e:
                    rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
                    rsp.request = req
                    cast(asyncio.Queue, q).put_nowait(RequestException(str(e), e.code, rsp))
                finally:
                    if not cast(asyncio.Event, header_recved).is_set():
                        cast(asyncio.Event, header_recved).set()
                    # None acts as a sentinel
                    await cast(asyncio.Queue, q).put(None)
    
            def cleanup(fut):
                self.release_curl(curl)
    
            stream_task = asyncio.create_task(perform())
            stream_task.add_done_callback(cleanup)
    
            await cast(asyncio.Event, header_recved).wait()
    
            # Unlike threads, coroutines does not use preemptive scheduling.
            # For asyncio, there is no need for a header_parsed event, the
            # _parse_response will execute in the foreground, no background tasks running.
            rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
    
            first_element = _peek_aio_queue(cast(asyncio.Queue, q))
            if isinstance(first_element, RequestException):
                self.release_curl(curl)
                raise first_element
    
            rsp.request = req
            rsp.astream_task = stream_task
            rsp.quit_now = quit_now
            rsp.queue = q
            return rsp
        else:
            try:
                # curl.debug()
                # print("using curl instance: ", curl)
                task = self.acurl.add_handle(curl)
>               await task
E               curl_cffi.curl.CurlError: Failed to perform, curl: (16) . See https://curl.se/libcurl/c/libcurl-errors.html first for more details.

../../../.cache/pypoetry/virtualenvs/crawlee-CKpluuj2-py3.10/lib/python3.10/site-packages/curl_cffi/requests/session.py:1333: CurlError

The above exception was the direct cause of the following exception:

http_client = <crawlee.http_clients.curl_impersonate.CurlImpersonateHttpClient object at 0x7fb472034f10>
proxy = ProxyInfo(url='***127.0.0.1:51183', scheme='http', hostname='127.0.0.1', port=51183, username='user', password='pass', session_id=None, proxy_tier=None)
httpbin = URL('***')

    @pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')
    async def test_crawl_with_proxy(
        http_client: CurlImpersonateHttpClient,
        proxy: ProxyInfo,
        httpbin: URL,
    ) -> None:
        url = str(httpbin.copy_with(path='/status/222'))
        request = Request.from_url(url)
    
        async with Statistics() as statistics:
>           result = await http_client.crawl(request, proxy_info=proxy, statistics=statistics)

tests/unit/http_clients/test_curl_impersonate.py:34: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
src/crawlee/http_clients/curl_impersonate.py:132: in crawl
    response = await client.request(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <curl_cffi.requests.session.AsyncSession object at 0x7fb471ec7df0>
method = 'GET'
url = 'https://janbuchar--httpbin.apify.actor/status/222?token=apify_api_xERCvoSw7a3SEVVclipJJPsfhGTFwZ3MKrI1'
params = None, data = None, json = None, headers = HttpHeaders(root={})
cookies = None, files = None, auth = None
timeout = <object object at 0x7fb475314500>, allow_redirects = True
max_redirects = None, proxies = None, proxy = None, proxy_auth = None
verify = None, referer = None, accept_encoding = 'gzip, deflate, br'
content_callback = None, impersonate = None, ja3 = None, akamai = None
extra_fp = None, default_headers = None, default_encoding = 'utf-8', quote = ''
http_version = None, interface = None, cert = None, stream = False
max_recv_speed = 0, multipart = None

    async def request(
        self,
        method: HttpMethod,
        url: str,
        params: Optional[Union[Dict, List, Tuple]] = None,
        data: Optional[Union[Dict[str, str], List[Tuple], str, BytesIO, bytes]] = None,
        json: Optional[dict] = None,
        headers: Optional[HeaderTypes] = None,
        cookies: Optional[CookieTypes] = None,
        files: Optional[Dict] = None,
        auth: Optional[Tuple[str, str]] = None,
        timeout: Optional[Union[float, Tuple[float, float], object]] = not_set,
        allow_redirects: Optional[bool] = None,
        max_redirects: Optional[int] = None,
        proxies: Optional[ProxySpec] = None,
        proxy: Optional[str] = None,
        proxy_auth: Optional[Tuple[str, str]] = None,
        verify: Optional[bool] = None,
        referer: Optional[str] = None,
        accept_encoding: Optional[str] = "gzip, deflate, br",
        content_callback: Optional[Callable] = None,
        impersonate: Optional[BrowserTypeLiteral] = None,
        ja3: Optional[str] = None,
        akamai: Optional[str] = None,
        extra_fp: Optional[Union[ExtraFingerprints, ExtraFpDict]] = None,
        default_headers: Optional[bool] = None,
        default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
        quote: Union[str, Literal[False]] = "",
        http_version: Optional[CurlHttpVersion] = None,
        interface: Optional[str] = None,
        cert: Optional[Union[str, Tuple[str, str]]] = None,
        stream: bool = False,
        max_recv_speed: int = 0,
        multipart: Optional[CurlMime] = None,
    ):
        """Send the request, see ``curl_cffi.requests.request`` for details on parameters."""
        self._check_session_closed()
    
        curl = await self.pop_curl()
        req, buffer, header_buffer, q, header_recved, quit_now = self._set_curl_options(
            curl=curl,
            method=method,
            url=url,
            params=params,
            data=data,
            json=json,
            headers=headers,
            cookies=cookies,
            files=files,
            auth=auth,
            timeout=timeout,
            allow_redirects=allow_redirects,
            max_redirects=max_redirects,
            proxies=proxies,
            proxy=proxy,
            proxy_auth=proxy_auth,
            verify=verify,
            referer=referer,
            accept_encoding=accept_encoding,
            content_callback=content_callback,
            impersonate=impersonate,
            ja3=ja3,
            akamai=akamai,
            extra_fp=extra_fp,
            default_headers=default_headers,
            quote=quote,
            http_version=http_version,
            interface=interface,
            stream=stream,
            max_recv_speed=max_recv_speed,
            multipart=multipart,
            cert=cert,
            queue_class=asyncio.Queue,
            event_class=asyncio.Event,
        )
        if stream:
            task = self.acurl.add_handle(curl)
    
            async def perform():
                try:
                    await task
                except CurlError as e:
                    rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
                    rsp.request = req
                    cast(asyncio.Queue, q).put_nowait(RequestException(str(e), e.code, rsp))
                finally:
                    if not cast(asyncio.Event, header_recved).is_set():
                        cast(asyncio.Event, header_recved).set()
                    # None acts as a sentinel
                    await cast(asyncio.Queue, q).put(None)
    
            def cleanup(fut):
                self.release_curl(curl)
    
            stream_task = asyncio.create_task(perform())
            stream_task.add_done_callback(cleanup)
    
            await cast(asyncio.Event, header_recved).wait()
    
            # Unlike threads, coroutines does not use preemptive scheduling.
            # For asyncio, there is no need for a header_parsed event, the
            # _parse_response will execute in the foreground, no background tasks running.
            rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
    
            first_element = _peek_aio_queue(cast(asyncio.Queue, q))
            if isinstance(first_element, RequestException):
                self.release_curl(curl)
                raise first_element
    
            rsp.request = req
            rsp.astream_task = stream_task
            rsp.quit_now = quit_now
            rsp.queue = q
            return rsp
        else:
            try:
                # curl.debug()
                # print("using curl instance: ", curl)
                task = self.acurl.add_handle(curl)
                await task
            except CurlError as e:
                rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
                rsp.request = req
                error = code2error(e.code, str(e))
>               raise error(str(e), e.code, rsp) from e
E               curl_cffi.requests.exceptions.HTTPError: Failed to perform, curl: (16) . See https://curl.se/libcurl/c/libcurl-errors.html first for more details.

../../../.cache/pypoetry/virtualenvs/crawlee-CKpluuj2-py3.10/lib/python3.10/site-packages/curl_cffi/requests/session.py:1338: HTTPError
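
For reference, curl error 16 in the output above is CURLE_HTTP2 ("A problem was detected in the HTTP2 framing layer") per the libcurl error list linked in the message, which suggests a transient HTTP/2 problem on the connection through the proxy rather than a bug in the test body itself. Until the root cause is found, a stopgap could be to retry on this class of error. A minimal sketch, not from the crawlee codebase (the helper name, attempt count, and delay are illustrative; curl_cffi's request exceptions derive from CurlError, so catching it should also cover the HTTPError seen here):

import asyncio
from collections.abc import Awaitable, Callable
from typing import TypeVar

from curl_cffi.curl import CurlError

T = TypeVar('T')

async def with_retries(coro_factory: Callable[[], Awaitable[T]], attempts: int = 3, delay: float = 0.5) -> T:
    """Await coro_factory() and retry on CurlError, re-raising the last failure."""
    for attempt in range(attempts):
        try:
            return await coro_factory()
        except CurlError:
            if attempt == attempts - 1:
                raise
            await asyncio.sleep(delay)
    raise RuntimeError('unreachable')  # for type checkers; the loop always returns or raises

In the test, the crawl call would then read e.g. result = await with_retries(lambda: http_client.crawl(request, proxy_info=proxy, statistics=statistics)).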
janbuchar added the bug and debt labels on Nov 25, 2024
github-actions bot added the t-tooling label on Nov 25, 2024
@janbuchar
Collaborator Author

Same goes for test_curl_impersonate.py::test_send_request_with_proxy; I suspect the reason will be the same.

@reproduce-bot

The following script was generated by an AI agent to help reproduce the issue:

# crawlee-python/reproduce.py
import asyncio
from unittest.mock import AsyncMock, MagicMock

# Mock the URL class and the crawlee components so the script runs without
# any real dependencies; the crawl call is forced to fail to mimic the flake.
URL = MagicMock()
CurlImpersonateHttpClient = MagicMock()
CurlImpersonateHttpClient.return_value.crawl = AsyncMock()
CurlImpersonateHttpClient.return_value.crawl.side_effect = Exception("Simulated network error")
ProxyInfo = MagicMock()
Request = MagicMock()
# MagicMock supports the async context manager protocol natively (Python 3.8+),
# so Statistics() can be used directly in an `async with` block.
Statistics = MagicMock()

async def test_crawl_with_proxy():
    http_client = CurlImpersonateHttpClient()
    proxy = ProxyInfo(url='http://127.0.0.1:51183', hostname='127.0.0.1', port=51183, username='user', password='pass')
    httpbin = URL('https://janbuchar--httpbin.apify.actor')

    url = str(httpbin.copy_with(path='/status/222'))  # mirrors the real test's URL construction
    request = Request.from_url(url)

    async with Statistics() as statistics:
        try:
            result = await http_client.crawl(request, proxy_info=proxy, statistics=statistics)
            assert result.http_response.status_code == 222
            print("Test passed successfully with no errors!")
        except Exception as e:
            raise AssertionError(e) from e

if __name__ == "__main__":
    asyncio.run(test_crawl_with_proxy())

How to run:

python3 crawlee-python/reproduce.py
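
Note that this script only simulates a failure through mocks and never touches curl_cffi or a real proxy, so it demonstrates the test's error handling rather than the flake itself. To surface the actual flakiness, one option (assuming the pytest-repeat plugin is installed; the count is arbitrary) is to rerun the real test in a loop:

pytest tests/unit/http_clients/test_curl_impersonate.py::test_crawl_with_proxy --count 50 -x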

Thank you for your valuable contribution to this project; we appreciate your feedback! Please respond with an emoji if you find this script helpful, and feel free to comment below if any improvements are needed.

Best regards from an AI Agent!
